From 287380010f8d75ba08a4db09d7fc5b481d4bf87b Mon Sep 17 00:00:00 2001 From: pvanhout Date: Thu, 21 Aug 2025 12:14:21 +0200 Subject: [PATCH 1/5] [AMDGPU][gfx1250] Implement SIMemoryLegalizer Implements the base of the MemoryLegalizer for a roughly correct GFX1250 memory model. Documentation will come later, and some remaining changes still have to be added, but this is the backbone of the model. --- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 + llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 + llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 73 +- llvm/lib/Target/AMDGPU/SOPInstructions.td | 5 + .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 22 + .../CodeGen/AMDGPU/atomics-system-scope.ll | 8 + .../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 292 +- .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 25 + .../memory-legalizer-fence-mmra-global.ll | 242 +- .../CodeGen/AMDGPU/memory-legalizer-fence.ll | 246 +- .../AMDGPU/memory-legalizer-flat-agent.ll | 2964 ++++++++-------- .../AMDGPU/memory-legalizer-flat-lastuse.ll | 22 +- .../memory-legalizer-flat-nontemporal.ll | 22 +- .../memory-legalizer-flat-singlethread.ll | 2220 ++++++------ .../AMDGPU/memory-legalizer-flat-system.ll | 3022 ++++++++--------- .../AMDGPU/memory-legalizer-flat-volatile.ll | 74 +- .../AMDGPU/memory-legalizer-flat-wavefront.ll | 2189 ++++++------ .../AMDGPU/memory-legalizer-flat-workgroup.ll | 2330 +++++++------ .../AMDGPU/memory-legalizer-global-agent.ll | 2888 ++++++++-------- .../AMDGPU/memory-legalizer-global-lastuse.ll | 22 +- .../memory-legalizer-global-nontemporal.ll | 22 +- .../memory-legalizer-global-singlethread.ll | 2220 ++++++------ .../AMDGPU/memory-legalizer-global-system.ll | 2789 ++++++++------- .../memory-legalizer-global-volatile.ll | 74 +- .../memory-legalizer-global-wavefront.ll | 2220 ++++++------ .../memory-legalizer-global-workgroup.ll | 2442 +++++++------ .../AMDGPU/memory-legalizer-local-agent.ll | 824 ++--- .../AMDGPU/memory-legalizer-local-system.ll | 824 ++--- 
.../AMDGPU/memory-legalizer-local-volatile.ll | 21 +- .../memory-legalizer-local-workgroup.ll | 824 ++--- 30 files changed, 14856 insertions(+), 14076 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 4475c8d1d1602..556ec683f2ec6 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1835,6 +1835,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasScratchBaseForwardingHazard() const { return GFX1250Insts && getGeneration() == GFX12; } + + /// \returns true if the subtarget requires a wait for xcnt before atomic + /// flat/global stores & rmw. + bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; } }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index c964d02ee2b97..f7dde2b90b68e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1055,6 +1055,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return AMDGPU::S_WAIT_DSCNT; case AMDGPU::S_WAIT_KMCNT_soft: return AMDGPU::S_WAIT_KMCNT; + case AMDGPU::S_WAIT_XCNT_soft: + return AMDGPU::S_WAIT_XCNT; default: return Opcode; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 6ab4eb4bde97c..95fa03fc97e69 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -606,7 +606,11 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; public: - SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} + SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) { + // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases + // the behavior is the same if assuming GFX12.0 in CU mode. + assert(ST.hasGFX1250Insts() ? 
ST.isCuModeEnabled() : true); + } bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, @@ -2378,12 +2382,16 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, STORECnt |= true; break; case SIAtomicScope::WORKGROUP: - // In WGP mode the waves of a work-group can be executing on either CU of - // the WGP. Therefore need to wait for operations to complete to ensure - // they are visible to waves in the other CU as the L0 is per CU. - // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. - if (!ST.isCuModeEnabled()) { + // GFX12.0: + // In WGP mode the waves of a work-group can be executing on either CU + // of the WGP. Therefore need to wait for operations to complete to + // ensure they are visible to waves in the other CU as the L0 is per CU. + // Otherwise in CU mode and all waves of a work-group are on the same CU + // which shares the same L0. + // + // GFX12.5: + // TODO DOCS + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2404,7 +2412,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: - case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WORKGROUP: // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is // not needed as LDS operations for all waves are executed in a total // global ordering as observed by all waves. Required if also @@ -2435,7 +2443,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // // This also applies to fences. Fences cannot pair with an instruction // tracked with bvh/samplecnt as we don't have any atomics that do that. 
- if (Order != AtomicOrdering::Acquire) { + if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); } @@ -2487,10 +2495,14 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, ScopeImm = AMDGPU::CPol::SCOPE_DEV; break; case SIAtomicScope::WORKGROUP: - // In WGP mode the waves of a work-group can be executing on either CU of - // the WGP. Therefore we need to invalidate the L0 which is per CU. - // Otherwise in CU mode all waves of a work-group are on the same CU, and so - // the L0 does not need to be invalidated. + // GFX12.0: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore we need to invalidate the L0 which is per CU. + // Otherwise in CU mode all waves of a work-group are on the same CU, and + // so the L0 does not need to be invalidated. + // + // GFX12.5: + // TODO DOCS if (ST.isCuModeEnabled()) return false; @@ -2535,7 +2547,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - // global_wb is only necessary at system scope for gfx120x targets. + // global_wb is only necessary at system scope for GFX12.0; + // it is also necessary at device scope for GFX12.5. // // Emitting it for lower scopes is a slow no-op, so we omit it // for performance. @@ -2545,6 +2558,12 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, .addImm(AMDGPU::CPol::SCOPE_SYS); break; case SIAtomicScope::AGENT: + // TODO DOCS + if (ST.hasGFX1250Insts()) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) + .addImm(AMDGPU::CPol::SCOPE_DEV); + } + break; case SIAtomicScope::WORKGROUP: // No WB necessary, but we still have to wait.
break; @@ -2607,17 +2626,31 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( } bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { - MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); - if (!CPol) - return false; + assert(MI.mayStore() && "Not a Store inst"); + const bool IsRMW = (MI.mayLoad() && MI.mayStore()); + bool Changed = false; + + // GFX12.5 only: xcnt wait is needed before flat and global atomics stores/rmw + if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) { + MachineBasicBlock &MBB = *MI.getParent(); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0); + Changed = true; + } + + // Remaining fixes do not apply to RMWs + if (IsRMW) + return Changed; + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); + if (!CPol) // Some vmem operations do not have a scope and are not concerned. + return Changed; const unsigned Scope = CPol->getImm() & CPol::SCOPE; // GFX12.0 only: Extra waits needed before system scope stores. 
if (!ST.hasGFX1250Insts()) { if (!Atomic && Scope == CPol::SCOPE_SYS) return insertWaitsBeforeSystemScopeStore(MI); - return false; + return Changed; } // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address @@ -2627,7 +2660,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI))) return setScope(MI, CPol::SCOPE_SE); - return false; + return Changed; } bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const { @@ -2839,6 +2872,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; + MachineInstr &RMWMI = *MI; if (MOI.isAtomic()) { const AtomicOrdering Order = MOI.getOrdering(); @@ -2873,6 +2907,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, Position::AFTER); } + Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true); return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index dbe0b8c496fed..e170268b47c44 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1653,6 +1653,11 @@ let OtherPredicates = [HasImageInsts] in { def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; } + +let SubtargetPredicate = HasWaitXcnt in { + def S_WAIT_XCNT_soft : SOPP_Pseudo<"s_soft_wait_xcnt", (ins s16imm:$simm16), "$simm16">; +} + // Represents the point at which a wave must wait for all outstanding direct loads to LDS. // Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 481a2540eacb7..e886ea4fc6ac6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1501,6 +1501,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1571,6 +1572,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1645,6 +1649,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1715,6 +1720,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1792,6 +1800,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -1902,6 +1911,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1947,6 +1959,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1987,6 +2000,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2031,6 +2047,7 @@ define amdgpu_kernel void 
@flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2107,6 +2124,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2190,6 +2208,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2418,6 +2439,7 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll index 5fc9f4a0f8038..4bb2a13d02cc7 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll @@ -364,6 +364,7 @@ define i16 @global_one_as_atomic_min_i16(ptr addrspace(1) %ptr, i16 %val) { ; 
GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -406,6 +407,7 @@ define i16 @global_one_as_atomic_umin_i16(ptr addrspace(1) %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -448,6 +450,7 @@ define i16 @global_one_as_atomic_max_i16(ptr addrspace(1) %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -490,6 +493,7 @@ define i16 @global_one_as_atomic_umax_i16(ptr addrspace(1) %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -1344,6 +1348,7 @@ define i16 @flat_one_as_atomic_min_i16(ptr %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -1386,6 +1391,7 @@ define i16 @flat_one_as_atomic_umin_i16(ptr %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -1428,6 +1434,7 @@ define i16 @flat_one_as_atomic_max_i16(ptr %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -1470,6 +1477,7 @@ define i16 @flat_one_as_atomic_umax_i16(ptr %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 3dedf008c917e..62129ebe40358 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -10,6 +10,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; 
GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -47,6 +49,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_2047: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:2047 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -85,6 +89,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:-2048 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -128,6 +134,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -166,6 +174,8 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_2048: ; GFX1250: ; %bb.0: +; 
GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -205,6 +215,8 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voff define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_neg2048: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -262,6 +274,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -276,6 +290,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -325,6 +341,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % ; 
GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -339,6 +357,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -389,6 +409,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -402,6 +424,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv 
scope:SCOPE_DEV @@ -449,6 +473,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] offset:42 scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -462,6 +488,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 offset:42 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -532,6 +560,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB10_5 ; GFX1250-SDAG-NEXT: .LBB10_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -578,6 +608,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB10_5 ; GFX1250-GISEL-NEXT: .LBB10_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -712,6 +744,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB11_5 ; GFX1250-SDAG-NEXT: .LBB11_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -761,6 +795,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB11_5 ; GFX1250-GISEL-NEXT: .LBB11_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -896,6 +932,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB12_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB12_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -933,6 +971,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB12_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB12_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; 
GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1044,6 +1084,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: .LBB13_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB13_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1084,6 +1126,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: .LBB13_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB13_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1186,6 +1230,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1224,6 +1270,8 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb 
scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1269,6 +1317,8 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1305,6 +1355,8 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1368,6 +1420,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB18_5 ; GFX1250-SDAG-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1414,6 +1468,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB18_5 ; GFX1250-GISEL-NEXT: .LBB18_3: ; %atomicrmw.global 
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1552,6 +1608,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB19_5 ; GFX1250-SDAG-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1601,6 +1659,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB19_5 ; GFX1250-GISEL-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1740,6 +1800,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1780,6 +1842,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB20_2: ; %atomicrmw.phi ; 
GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1902,6 +1966,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1945,6 +2011,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2058,6 +2126,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2096,6 +2166,8 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float 
@flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2141,6 +2213,8 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2177,6 +2251,8 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2240,6 +2316,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB26_5 ; GFX1250-SDAG-NEXT: .LBB26_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2286,6 +2364,8 @@ define amdgpu_ps <2 x float> 
@flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB26_5 ; GFX1250-GISEL-NEXT: .LBB26_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2426,6 +2506,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB27_5 ; GFX1250-SDAG-NEXT: .LBB27_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2475,6 +2557,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB27_5 ; GFX1250-GISEL-NEXT: .LBB27_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2616,6 +2700,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB28_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB28_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: 
global_inv scope:SCOPE_DEV @@ -2656,6 +2742,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB28_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB28_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2780,6 +2868,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB29_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB29_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2823,6 +2913,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB29_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB29_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2938,6 +3030,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2976,6 +3070,8 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3021,6 +3117,8 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3057,6 +3155,8 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3120,6 +3220,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB34_5 ; GFX1250-SDAG-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -3167,6 +3269,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB34_5 ; GFX1250-GISEL-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -3306,6 +3410,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB35_5 ; GFX1250-SDAG-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -3356,6 +3462,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB35_5 ; GFX1250-GISEL-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -3496,6 +3604,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB36_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB36_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: 
s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -3537,6 +3647,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB36_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB36_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -3660,6 +3772,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB37_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB37_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -3704,6 +3818,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB37_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB37_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -3818,6 +3934,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; 
GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3856,6 +3974,8 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i3 define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3901,6 +4021,8 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voff define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3937,6 +4059,8 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4000,6 +4124,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB42_5 ; GFX1250-SDAG-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: 
global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -4047,6 +4173,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB42_5 ; GFX1250-GISEL-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -4186,6 +4314,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB43_5 ; GFX1250-SDAG-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -4236,6 +4366,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB43_5 ; GFX1250-GISEL-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -4376,6 +4508,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-SDAG-NEXT: .LBB44_2: ; %atomicrmw.phi ; 
GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB44_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -4417,6 +4551,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-GISEL-NEXT: .LBB44_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB44_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -4540,6 +4676,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: .LBB45_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB45_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -4584,6 +4722,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: .LBB45_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB45_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -4698,6 +4838,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg 
%sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4736,6 +4878,8 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4781,6 +4925,8 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4817,6 +4963,8 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4880,6 +5028,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; 
GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB50_5 ; GFX1250-SDAG-NEXT: .LBB50_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -4927,6 +5077,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB50_5 ; GFX1250-GISEL-NEXT: .LBB50_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -5066,6 +5218,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB51_5 ; GFX1250-SDAG-NEXT: .LBB51_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -5116,6 +5270,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB51_5 ; GFX1250-GISEL-NEXT: .LBB51_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV 
@@ -5256,6 +5412,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB52_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB52_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -5297,6 +5455,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB52_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB52_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -5420,6 +5580,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB53_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB53_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -5464,6 +5626,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB53_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB53_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ 
-5650,7 +5814,7 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-LABEL: flat_max_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn: @@ -5681,7 +5845,7 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-LABEL: flat_max_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn_neg128: @@ -5735,20 +5899,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_4 ; GFX1250-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB58_5 ; GFX1250-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -5782,20 +5943,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr 
inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_4 ; GFX1250-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB58_5 ; GFX1250-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -5923,20 +6081,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_4 ; GFX1250-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB59_5 ; GFX1250-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; 
GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -5973,20 +6128,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_4 ; GFX1250-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB59_5 ; GFX1250-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -6119,9 +6271,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB60_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: 
s_cbranch_execz .LBB60_2 @@ -6158,9 +6310,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2 @@ -6279,9 +6431,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB61_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2 @@ -6321,9 +6473,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2 @@ -6504,7 +6656,7 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-LABEL: flat_min_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; 
GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn: @@ -6535,7 +6687,7 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-LABEL: flat_min_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn_neg128: @@ -6589,20 +6741,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_4 ; GFX1250-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB66_5 ; GFX1250-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -6636,20 +6785,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_4 ; GFX1250-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch 
.LBB66_5 ; GFX1250-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -6777,20 +6923,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_4 ; GFX1250-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB67_5 ; GFX1250-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, 
-1, v0, vcc_lo @@ -6827,20 +6970,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_4 ; GFX1250-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB67_5 ; GFX1250-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -6973,9 +7113,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB68_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2 @@ -7012,9 +7152,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, 
v[4:5], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2 @@ -7133,9 +7273,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB69_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2 @@ -7175,9 +7315,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2 @@ -7358,7 +7498,7 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-LABEL: flat_umax_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn: @@ -7389,7 +7529,7 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-LABEL: 
flat_umax_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn_neg128: @@ -7443,20 +7583,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_4 ; GFX1250-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB74_5 ; GFX1250-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -7490,20 +7627,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_4 ; GFX1250-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB74_5 ; GFX1250-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; 
GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -7631,20 +7765,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_4 ; GFX1250-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB75_5 ; GFX1250-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -7681,20 +7812,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_4 ; GFX1250-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB75_5 ; GFX1250-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -7827,9 +7955,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB76_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2 @@ -7866,9 +7994,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2 @@ -7987,9 +8115,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB77_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2 @@ -8029,9 +8157,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2 @@ -8212,7 +8340,7 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-LABEL: flat_umin_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn: @@ -8243,7 +8371,7 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-LABEL: flat_umin_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: 
flat_umin_saddr_i32_nortn_neg128: @@ -8297,20 +8425,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_4 ; GFX1250-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB82_5 ; GFX1250-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -8344,20 +8469,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_4 ; GFX1250-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB82_5 ; GFX1250-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2 ; 
GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -8485,20 +8607,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_4 ; GFX1250-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB83_5 ; GFX1250-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -8535,20 +8654,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_4 ; GFX1250-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB83_5 ; GFX1250-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] 
offset:-128 th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -8681,9 +8797,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB84_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2 @@ -8720,9 +8836,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2 @@ -8841,9 +8957,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_endpgm 
; GFX1250-SDAG-NEXT: .LBB85_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2 @@ -8883,9 +8999,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 3856f0c327495..160b35352d8a4 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1473,6 +1473,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1513,6 +1514,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1557,6 +1561,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1597,6 +1602,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1673,6 +1681,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -1765,6 +1774,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1809,6 +1821,7 @@ define amdgpu_kernel 
void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1849,6 +1862,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1893,6 +1909,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1969,6 +1986,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2063,6 +2081,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2136,6 +2157,7 @@ define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2275,6 +2297,7 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2307,6 +2330,7 @@ define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, doub ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2339,6 +2363,7 @@ define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll index 97d52d5f1f26d..209775314a505 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll +++ 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll @@ -80,9 +80,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: workgroup_acquire_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -151,9 +153,11 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: workgroup_release_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -227,9 +231,11 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: workgroup_acq_rel_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -303,9 +309,11 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: workgroup_seq_cst_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -377,9 +385,11 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: workgroup_one_as_acquire_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -448,9 +458,11 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: workgroup_one_as_release_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -524,9 +536,11 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -600,9 +614,11 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: -; GFX1250: ; 
%bb.0: ; %entry -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -785,13 +801,12 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: agent_release_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: agent_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -891,14 +906,13 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: agent_acq_rel_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: agent_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -998,14 +1012,13 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-LABEL: agent_seq_cst_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: agent_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1188,13 +1201,12 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: agent_one_as_release_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: agent_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1294,14 +1306,13 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: agent_one_as_acq_rel_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1401,14 +1412,13 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: agent_one_as_seq_cst_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1597,14 +1607,12 @@ define amdgpu_kernel void @system_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: system_release_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: system_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1710,15 +1718,13 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: system_acq_rel_fence: -; GFX1250: ; %bb.0: ; %entry 
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: system_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1824,15 +1830,13 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: system_seq_cst_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: system_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -2021,14 +2025,12 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: system_one_as_release_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: system_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb 
scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -2134,15 +2136,13 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: system_one_as_acq_rel_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: system_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -2248,15 +2248,13 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: system_one_as_seq_cst_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: system_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void diff --git 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index b3f6533d43887..07db15ee8e60e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -1064,10 +1064,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: workgroup_acquire_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire ret void @@ -1144,10 +1145,11 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: workgroup_release_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") release ret void @@ -1229,10 +1231,11 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: workgroup_acq_rel_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel ret void @@ -1314,10 +1317,11 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
workgroup_seq_cst_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst ret void @@ -1389,9 +1393,11 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: workgroup_one_as_acquire_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire ret void @@ -1460,9 +1466,11 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: workgroup_one_as_release_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release ret void @@ -1536,9 +1544,11 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel ret void @@ -1612,9 +1622,11 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst ret void @@ -1797,13 +1809,12 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: agent_release_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: agent_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") release ret void @@ -1903,14 +1914,13 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: agent_acq_rel_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: agent_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel ret void @@ -2010,14 +2020,13 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
agent_seq_cst_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: agent_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst ret void @@ -2200,13 +2209,12 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: agent_one_as_release_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: agent_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release ret void @@ -2306,14 +2314,13 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: agent_one_as_acq_rel_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; 
GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel ret void @@ -2413,14 +2420,13 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: agent_one_as_seq_cst_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst ret void @@ -2609,14 +2615,12 @@ define amdgpu_kernel void @system_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: system_release_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: system_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence release ret void @@ -2722,15 +2726,13 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: system_acq_rel_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: 
global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: system_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence acq_rel ret void @@ -2836,15 +2838,13 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: system_seq_cst_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: system_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence seq_cst ret void @@ -3033,14 +3033,12 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: system_one_as_release_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: system_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") release ret void @@ -3146,15 +3144,13 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv 
scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: system_one_as_acq_rel_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: system_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel ret void @@ -3260,15 +3256,13 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: system_one_as_seq_cst_fence: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: system_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 36adbc0011118..fe7fd8522bd6a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -825,23 +825,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
flat_agent_seq_cst_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") seq_cst, align 4 @@ -993,15 +989,16 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 
s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") unordered, align 4 @@ -1152,15 +1149,16 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") monotonic, align 4 @@ -1335,19 +1333,19 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: 
s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") release, align 4 @@ -1522,19 +1520,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] 
scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") seq_cst, align 4 @@ -1685,15 +1683,16 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") monotonic @@ -1875,17 +1874,18 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire @@ -2060,19 +2060,19 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") release @@ -2278,21 +2278,21 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel @@ -2498,21 +2498,21 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst @@ -2722,18 +2722,19 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire @@ -2972,24 +2973,22 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: 
flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel @@ -3228,24 +3227,22 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst @@ -3485,19 +3482,20 @@ define 
amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3768,21 +3766,22 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: 
v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4046,23 +4045,23 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: 
s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4357,25 +4356,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; 
GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4670,25 +4669,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: 
global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4959,21 +4958,22 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5244,21 +5244,22 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5553,25 +5554,25 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; 
GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5866,25 +5867,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6179,25 +6180,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6492,25 +6493,25 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6805,25 +6806,25 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7118,25 +7119,25 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7431,25 +7432,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7744,25 +7745,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: 
flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8046,21 +8047,22 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 
%in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8361,22 +8363,23 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8686,25 +8689,25 @@ define amdgpu_kernel void 
@flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr 
i32, ptr %out, i32 4 @@ -9033,28 +9036,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; 
GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9383,28 +9384,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; 
GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9709,24 +9708,23 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10027,22 +10025,23 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, 
v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10371,28 +10370,26 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10721,28 +10718,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, 
s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11071,28 +11066,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: 
flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11421,28 +11414,26 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 
0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11767,26 +11758,26 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: 
s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12115,28 +12106,26 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12465,28 +12454,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12815,28 +12802,26 @@ define amdgpu_kernel void 
@flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv 
scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13679,24 +13664,20 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_seq_cst_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") seq_cst, align 4 @@ -13848,15 +13829,16 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
flat_agent_one_as_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") unordered, align 4 @@ -14007,15 +13989,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") monotonic, align 4 @@ -14190,19 
+14173,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") release, align 4 @@ -14377,19 +14360,19 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] 
scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") seq_cst, align 4 @@ -14540,15 +14523,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") monotonic @@ -14724,19 +14708,20 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; 
GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX12-CU-NEXT: s_endpgm -; -; GFX1250-LABEL: flat_agent_one_as_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire @@ -14911,19 +14896,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: 
flat_agent_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") release @@ -15125,21 +15110,21 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel @@ -15341,21 +15326,21 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst @@ -15575,19 +15560,20 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry 
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire @@ -15836,25 +15822,23 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; 
GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel @@ -16103,25 +16087,23 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: 
s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst @@ -16361,19 +16343,20 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16640,21 +16623,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 
0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16918,23 +16902,23 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] 
offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17225,25 +17209,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv 
scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17534,25 +17518,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; 
GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17819,21 +17803,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18100,21 +18085,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
flat_agent_one_as_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18405,25 +18391,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: 
v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18714,25 +18700,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19023,25 +19009,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 
v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19332,25 +19318,25 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19641,25 +19627,25 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19950,25 +19936,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20259,25 +20245,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20568,25 +20554,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; 
GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20870,21 +20856,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21195,23 +21182,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; 
GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21521,25 +21509,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21878,29 +21866,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: 
s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22239,29 +22225,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22576,25 +22560,24 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22905,23 +22888,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 
; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23260,29 +23244,27 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
flat_agent_one_as_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE 
+; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23621,29 +23603,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23982,29 +23962,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24343,29 +24321,27 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24700,27 +24676,27 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; 
GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25059,29 +25035,27 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 
0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25420,29 +25394,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 
0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25781,29 +25753,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 
v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; 
GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index 5526b29037977..22c1b6f9fe875 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -107,18 +107,16 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) { ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_last_use_and_volatile_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_last_use_and_volatile_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm entry: %val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr %out diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index 964f1c8957f6f..c949790b97d72 100644 --- 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -1322,18 +1322,16 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_nontemporal_volatile_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_nontemporal_volatile_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load volatile i32, ptr %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index 871c941dd6dca..8a75db2c36dc7 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -929,15 +929,16 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; 
GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") unordered, align 4 @@ -1088,15 +1089,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") monotonic, align 4 @@ -1247,15 +1249,16 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: 
s_endpgm ; -; GFX1250-LABEL: flat_singlethread_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") release, align 4 @@ -1406,15 +1409,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") seq_cst, align 4 @@ -1565,15 +1569,16 @@ 
define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") monotonic @@ -1724,15 +1729,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; 
GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire @@ -1883,15 +1889,16 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") release @@ -2042,15 +2049,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 
0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel @@ -2201,15 +2209,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst @@ -2404,17 +2413,18 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: 
s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire @@ -2610,17 +2620,18 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in 
syncscope("singlethread") acq_rel @@ -2816,17 +2827,18 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst @@ -3066,19 +3078,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3318,19 +3331,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3570,19 +3584,20 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3822,19 +3837,20 @@ define 
amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4074,19 +4090,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4326,19 +4343,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4578,19 +4596,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm 
ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4830,19 +4849,20 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5082,19 +5102,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, 
s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5334,19 +5355,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5586,19 +5608,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5838,19 +5861,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6090,19 +6114,20 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6342,19 +6367,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm 
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6594,19 +6620,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6890,21 +6917,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7190,21 +7218,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7490,21 +7519,22 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7790,21 +7820,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8090,21 +8121,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8390,21 +8422,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, 
v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8690,21 +8723,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8990,21 +9024,22 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, 
s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9290,21 +9325,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] 
offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9590,21 +9626,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr 
i32, ptr %out, i32 4 @@ -9890,21 +9927,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10190,21 +10228,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
flat_singlethread_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10490,21 +10529,22 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: 
s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10790,21 +10830,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11090,21 +11131,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] 
scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12030,15 +12072,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out 
syncscope("singlethread-one-as") unordered, align 4 @@ -12189,15 +12232,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") monotonic, align 4 @@ -12348,15 +12392,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, 
s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") release, align 4 @@ -12507,15 +12552,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -12666,15 +12712,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -12825,15 +12872,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12984,15 +13032,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") release @@ -13143,15 +13192,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -13302,15 +13352,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: -; GFX1250: 
; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -13505,17 +13556,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire @@ -13711,17 +13763,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -13917,17 +13970,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 
0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -14167,19 +14221,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14419,19 +14474,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: 
s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14671,19 +14727,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14923,19 +14980,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15175,19 +15233,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; 
GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15427,19 +15486,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15679,19 +15739,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15931,19 +15992,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], 
v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16183,19 +16245,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16435,19 +16498,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 
s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16687,19 +16751,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, ptr %out, i32 4 @@ -16939,19 +17004,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17191,19 +17257,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 
0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17443,19 +17510,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: 
; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17695,19 +17763,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17991,21 +18060,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, 
ptr %out, i32 4 @@ -18291,21 +18361,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18591,21 +18662,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18891,21 +18963,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: 
s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19191,21 +19264,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19491,21 +19565,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; 
GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19791,21 +19866,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20091,21 +20167,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, 
s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20391,21 +20468,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20691,21 +20769,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20991,21 +21070,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ 
-21291,21 +21371,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21591,21 +21672,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21891,21 +21973,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, 
s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22191,21 +22274,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 9d70a2437e553..b5ea23d4655b6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -829,23 +829,19 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_seq_cst_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in seq_cst, align 4 @@ -997,15 +993,16 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic 
i32 %in, ptr %out unordered, align 4 @@ -1156,15 +1153,16 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out monotonic, align 4 @@ -1343,20 +1341,19 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out release, align 4 @@ -1535,20 +1532,19 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out seq_cst, align 4 @@ -1699,15 +1695,16 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-LABEL: flat_system_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in monotonic @@ -1891,17 +1888,18 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: 
global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acquire @@ -2080,20 +2078,19 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in release @@ -2305,22 +2302,21 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: 
global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel @@ -2532,22 +2528,21 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; 
%entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst @@ -2759,18 +2754,19 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm 
ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acquire @@ -3015,25 +3011,22 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel @@ -3278,25 +3271,22 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; 
GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst @@ -3536,19 +3526,20 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_monotonic_monotonic_cmpxchg: -; GFX1250: 
; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3821,21 +3812,22 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] 
offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4103,24 +4095,23 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; 
GFX1250-CU-LABEL: flat_system_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4421,26 +4412,25 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4741,26 +4731,25 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: 
flat_system_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5033,21 +5022,22 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5320,21 +5310,22 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5635,26 +5626,25 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5955,26 +5945,25 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; 
GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6275,26 +6264,25 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb 
scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6595,26 +6583,25 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6915,26 +6902,25 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7235,26 +7221,25 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7555,26 +7540,25 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7875,26 +7859,25 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8178,21 +8161,22 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8495,22 +8479,23 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8824,26 +8809,25 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9178,29 +9162,26 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9535,29 +9516,26 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9864,24 +9842,23 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: 
flat_system_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10184,22 +10161,23 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10534,29 +10512,26 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: 
s_endpgm +; GFX1250-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10891,29 +10866,26 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] 
offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11248,29 +11220,26 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11605,29 +11574,26 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; 
GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11958,27 +11924,26 @@ define amdgpu_kernel void 
@flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: 
flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12313,29 +12278,26 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12670,29 +12632,26 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 
v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13027,29 +12986,26 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13896,24 +13852,20 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_seq_cst_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") seq_cst, align 4 @@ -14065,15 +14017,16 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") unordered, align 4 @@ -14224,15 +14177,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; 
GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") monotonic, align 4 @@ -14411,20 +14365,19 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") release, align 4 @@ -14603,20 +14556,19 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") seq_cst, align 4 @@ -14767,15 +14719,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") monotonic @@ -14953,19 +14906,20 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX12-CU-NEXT: s_endpgm -; -; GFX1250-LABEL: flat_system_one_as_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { 
entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire @@ -15144,20 +15098,19 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") release @@ -15365,22 +15318,21 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb 
scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel @@ -15588,22 +15540,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; 
%entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst @@ -15825,19 +15776,20 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; 
GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire @@ -16092,26 +16044,23 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr 
%out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel @@ -16366,26 +16315,23 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") 
seq_cst @@ -16625,19 +16571,20 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16906,21 +16853,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17188,24 +17136,23 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17502,26 +17449,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; 
GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17818,26 +17764,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: 
s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18106,21 +18051,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18389,21 +18335,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18700,26 +18647,25 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, 
s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19016,26 +18962,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19332,26 +19277,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 
s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19648,26 +19592,25 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19964,26 +19907,25 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20280,26 +20222,25 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20596,26 +20537,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20912,26 +20852,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21215,21 +21154,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21542,23 +21482,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21872,26 +21813,25 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 
s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22236,30 +22176,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, 
s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22604,30 +22541,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: 
s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22944,25 +22878,24 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23275,23 +23208,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; 
GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23636,30 +23570,27 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24004,30 +23935,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 
v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS 
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24372,30 +24300,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24740,30 +24665,27 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25104,28 +25026,27 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 
0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25470,30 +25391,27 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: 
s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25838,30 +25756,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -26206,30 +26121,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; 
GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index 77f52e4d4b9fd..68af003ba6353 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -145,18 +145,16 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_nontemporal_load_0: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_nontemporal_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load volatile i32, ptr %in, align 4 @@ -422,22 +420,20 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_nontemporal_load_1: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v1, v0 
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s4, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_nontemporal_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1144,16 +1140,18 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_volatile_workgroup_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_volatile_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic volatile i32 %in, ptr %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index f086542b3d1f8..a4804675fd3cf 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -929,15 +929,16 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") unordered, align 4 @@ -1088,15 +1089,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-LABEL: flat_wavefront_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") monotonic, align 4 @@ -1247,15 +1249,16 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") release, align 4 @@ -1406,15 +1409,16 @@ define amdgpu_kernel void 
@flat_wavefront_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") seq_cst, align 4 @@ -1565,15 +1569,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw 
volatile xchg ptr %out, i32 %in syncscope("wavefront") monotonic @@ -1724,15 +1729,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire @@ -1883,15 +1889,16 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") release @@ -2042,15 +2049,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel @@ -2201,15 +2209,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst @@ -2404,17 +2413,18 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire @@ -2610,17 +2620,18 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 
0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel @@ -2816,17 +2827,18 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, 
v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst @@ -3066,19 +3078,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3318,19 +3331,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3570,19 +3584,20 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 
v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3822,19 +3837,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4074,19 +4090,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4326,19 +4343,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4578,19 +4596,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4830,19 +4849,20 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 
0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5082,19 +5102,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5334,19 +5355,20 @@ define amdgpu_kernel void 
@flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5586,19 +5608,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5838,19 +5861,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6090,19 +6114,20 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 
@@ -6342,19 +6367,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6594,19 +6620,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6890,21 +6917,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7190,21 +7218,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7490,21 +7519,22 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 
v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7790,21 +7820,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, 
s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8090,21 +8121,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8390,21 +8422,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; 
GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8690,21 +8723,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8990,21 +9024,22 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9290,21 +9325,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9590,21 +9626,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9890,21 +9927,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10190,21 +10228,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10490,21 +10529,22 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, 
i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10790,21 +10830,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11090,21 +11131,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12030,15 +12072,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") unordered, align 4 @@ -12189,15 +12232,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") monotonic, align 4 @@ -12348,15 +12392,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_release_store: -; 
GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") release, align 4 @@ -12507,15 +12552,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -12666,15 +12712,16 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -12825,15 +12872,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; 
GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12984,15 +13032,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") release @@ -13143,15 +13192,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 
0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -13302,15 +13352,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -13505,17 +13556,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire @@ -13711,17 +13763,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; 
GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -13917,17 +13970,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -14167,19 +14221,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14419,19 +14474,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14671,19 +14727,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], 
s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14923,19 +14980,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15175,19 +15233,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15427,19 +15486,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; 
GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15679,19 +15739,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15931,19 +15992,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16183,19 +16245,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16435,19 +16498,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16687,19 +16751,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16939,19 +17004,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ 
-17191,19 +17257,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17443,19 +17510,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; 
GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17695,19 +17763,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17991,21 +18060,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18291,21 +18361,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18591,21 +18662,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ 
-18891,21 +18963,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19191,21 +19264,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19491,21 +19565,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19791,21 +19866,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20091,21 +20167,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20391,21 +20468,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20691,21 +20769,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20991,21 +21070,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21291,21 +21371,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, 
v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21591,21 +21672,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21891,21 +21973,22 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index d8e6ad043e061..01801637ce770 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -811,17 +811,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_seq_cst_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4 @@ -973,15 +974,16 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4 @@ -1132,15 +1134,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4 @@ -1308,16 +1311,18 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: 
flat_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4 @@ -1485,16 +1490,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4 @@ -1645,15 +1652,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_monotonic_atomicrmw: -; GFX1250: ; %bb.0: 
; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic @@ -1823,16 +1831,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire @@ -2000,16 +2009,18 @@ define amdgpu_kernel void 
@flat_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") release @@ -2196,17 +2207,19 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 
v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel @@ -2393,17 +2406,19 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -2610,17 +2625,18 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire @@ -2847,18 +2863,20 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel @@ -3085,18 +3103,20 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -3336,19 +3356,20 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3607,20 +3628,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 
v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3877,20 +3899,22 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 
s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4166,21 +4190,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4456,21 +4482,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4729,20 +4757,21 @@ define amdgpu_kernel void 
@flat_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5001,20 +5030,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5290,21 +5320,23 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5580,21 +5612,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5870,21 +5904,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 
0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6160,21 +6196,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6458,21 +6496,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
flat_workgroup_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6770,21 +6809,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: 
s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7087,22 +7127,24 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7419,22 +7461,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, 
v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7751,22 +7795,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: 
flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8066,21 +8112,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8378,21 +8425,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8709,22 +8757,24 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9041,22 +9091,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9373,22 +9425,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9705,22 +9759,24 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, 
s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10035,22 +10091,24 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, 
ptr %out, i32 4 @@ -10367,22 +10425,24 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10699,22 +10759,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; 
GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11031,22 +11093,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11839,16 +11903,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: 
s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -12000,15 +12066,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") 
unordered, align 4 @@ -12159,15 +12226,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4 @@ -12328,15 +12396,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4 @@ -12497,15 +12568,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -12656,15 +12730,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: 
flat_workgroup_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -12825,15 +12900,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire @@ -12994,15 +13071,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") release @@ -13173,15 +13253,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in 
syncscope("workgroup-one-as") acq_rel @@ -13352,15 +13436,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -13563,17 +13651,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; 
%bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire @@ -13789,17 +13878,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in 
syncscope("workgroup-one-as") acq_rel @@ -14015,17 +14107,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -14265,19 +14360,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: 
v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14527,19 +14623,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14789,19 +14887,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15061,19 +15162,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15333,19 +15438,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; 
GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15595,19 +15704,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: 
s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15857,19 +15968,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16129,19 +16242,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16401,19 +16518,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, ptr %out, i32 4 @@ -16673,19 +16794,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16945,19 +17070,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17217,19 +17346,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17489,19 +17622,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17761,19 +17898,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18033,19 +18174,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18329,21 +18474,22 @@ 
define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18637,21 +18783,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18947,21 +19094,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 
0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19267,21 +19417,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19587,21 +19740,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19897,21 +20053,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN 
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20205,21 +20362,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20525,21 +20683,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20845,21 +21006,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21165,21 +21329,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21485,21 +21652,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21803,21 +21973,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 
0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22123,21 +22296,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, ptr %out, i32 4 @@ -22443,21 +22619,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22763,21 +22942,24 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 184e15406bfbc..ad163cefe57d4 100644 --- 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -829,23 +829,19 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_seq_cst_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4 @@ -1004,15 +1000,16 @@ define amdgpu_kernel void @global_agent_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") unordered, align 4 @@ -1170,15 +1167,16 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4 @@ -1361,19 +1359,19 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: 
s_endpgm ; -; GFX1250-LABEL: global_agent_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") release, align 4 @@ -1556,19 +1554,19 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4 @@ -1724,15 +1722,16 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") monotonic @@ -1917,17 +1916,18 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire @@ -2108,19 +2108,19 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") release @@ -2330,21 +2330,21 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel @@ 
-2554,21 +2554,21 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -2768,18 +2768,19 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, 
s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire @@ -3009,24 +3010,22 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; 
%entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel @@ -3256,24 +3255,22 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -3494,19 +3491,20 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3756,21 +3754,22 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4016,23 +4015,23 @@ define amdgpu_kernel void 
@global_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4307,25 +4306,25 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-LABEL: global_agent_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4600,25 +4599,25 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; 
-; GFX1250-LABEL: global_agent_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4868,21 +4867,22 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm 
; -; GFX1250-LABEL: global_agent_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5132,21 +5132,22 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5421,25 +5422,25 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; 
GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5714,25 +5715,25 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; 
GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6007,25 +6008,25 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; 
GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6300,25 +6301,25 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; 
GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6593,25 +6594,25 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; 
GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6886,25 +6887,25 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; 
GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7179,25 +7180,25 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; 
GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7472,25 +7473,25 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; 
GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7741,21 +7742,22 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], 
s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8024,22 +8026,23 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: 
s_endpgm +; GFX1250-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8317,25 +8320,25 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm 
+; GFX1250-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8633,28 +8636,26 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 
-; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8952,28 +8953,26 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: 
s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9246,24 +9245,23 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9532,22 +9530,23 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9845,28 +9844,26 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10164,28 +10161,26 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10483,28 +10478,26 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) 
%out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10802,28 +10795,26 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, 
v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11117,26 +11108,26 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; 
GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11434,28 +11425,26 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11753,28 +11742,26 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -12072,28 +12059,26 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: 
s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -12918,23 +12903,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_seq_cst_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; 
GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4 @@ -13093,15 +13074,16 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") unordered, align 4 @@ -13259,15 +13241,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 
v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") monotonic, align 4 @@ -13450,19 +13433,19 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") release, align 4 @@ -13645,19 +13628,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") seq_cst, align 4 @@ -13813,15 +13796,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-LABEL: global_agent_one_as_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") monotonic @@ -14006,17 +13990,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, 
s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire @@ -14197,19 +14182,19 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") release @@ -14419,21 +14404,21 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -14643,21 +14628,21 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; 
GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -14857,18 +14842,19 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire @@ -15098,24 +15084,22 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -15345,24 +15329,22 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -15583,19 +15565,20 @@ 
define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15845,21 +15828,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16105,23 +16089,23 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16396,25 +16380,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: 
s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16689,25 +16673,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; 
GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16957,21 +16941,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] 
offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17221,21 +17206,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17510,25 +17496,25 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17803,25 +17789,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18096,25 +18082,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18389,25 +18375,25 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18682,25 +18668,25 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18975,25 +18961,25 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19268,25 +19254,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19561,25 +19547,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19830,21 +19816,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20113,22 +20100,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20426,28 +20414,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20745,28 +20731,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; 
GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21039,24 +21023,23 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: 
global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21325,22 +21308,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: 
s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21638,28 +21622,26 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: 
s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21957,28 +21939,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 
0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22276,28 +22256,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22595,28 +22573,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; 
GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22910,26 +22886,26 @@ define 
amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, 
s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -23227,28 +23203,26 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -23546,28 +23520,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -23865,28 +23837,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index ed2d62356f8f2..bda702156905a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -87,18 +87,16 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: global_last_use_and_volatile_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_last_use_and_volatile_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm entry: %val = load volatile i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index c1bfe21865c15..4575cbbfd839e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -1105,18 +1105,16 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_nontemporal_volatile_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_nontemporal_volatile_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { 
entry: %val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 6a5a6e01c741b..4f2ea4493560f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -945,15 +945,16 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") unordered, align 4 @@ -1111,15 +1112,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; 
GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") monotonic, align 4 @@ -1277,15 +1279,16 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") release, align 4 @@ -1443,15 +1446,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; 
GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") seq_cst, align 4 @@ -1607,15 +1611,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") monotonic @@ -1771,15 +1776,16 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire @@ -1935,15 +1941,16 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw 
volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") release @@ -2099,15 +2106,16 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel @@ -2263,15 +2271,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst @@ -2455,17 +2464,18 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire @@ -2650,17 +2660,18 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel @@ -2845,17 +2856,18 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst @@ -3076,19 +3088,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3309,19 +3322,20 @@ define amdgpu_kernel void 
@global_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3542,19 +3556,20 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3775,19 +3790,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4008,19 +4024,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4241,19 +4258,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4474,19 +4492,20 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-LABEL: global_singlethread_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4707,19 +4726,20 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4940,19 +4960,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, 
s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5173,19 +5194,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5406,19 +5428,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5639,19 +5662,20 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5872,19 +5896,20 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: 
global_singlethread_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6105,19 +6130,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6338,19 +6364,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6601,21 +6628,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: 
global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6868,21 +6896,22 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7135,21 +7164,22 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7402,21 +7432,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7669,21 +7700,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; 
%bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7936,21 +7968,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8203,21 +8236,22 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8470,21 +8504,22 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8737,21 +8772,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9004,21 +9040,22 @@ define amdgpu_kernel void 
@global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9271,21 +9308,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; 
%entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9538,21 +9576,22 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9805,21 +9844,22 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: 
global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10072,21 +10112,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: 
global_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10339,21 +10380,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11294,15 +11336,16 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") unordered, align 4 @@ -11460,15 +11503,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_monotonic_store: -; 
GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") monotonic, align 4 @@ -11626,15 +11670,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") release, align 4 @@ -11792,15 +11837,16 @@ 
define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -11956,15 +12002,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -12120,15 +12167,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12284,15 +12332,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_release_atomicrmw: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") release @@ -12448,15 +12497,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -12612,15 +12662,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -12804,17 +12855,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] 
+; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12999,17 +13051,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -13194,17 +13247,18 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -13425,19 +13479,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13658,19 +13713,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13891,19 +13947,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14124,19 +14181,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
global_singlethread_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14357,19 +14415,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14590,19 +14649,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14823,19 +14883,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15056,19 +15117,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15289,19 +15351,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: -; 
GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15522,19 +15585,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15755,19 +15819,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15988,19 +16053,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, 
i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16221,19 +16287,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16454,19 +16521,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16687,19 +16755,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, 
v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16950,21 +17019,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17217,21 +17287,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17484,21 +17555,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] 
offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17751,21 +17823,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18018,21 +18091,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18285,21 +18359,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, 
v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18552,21 +18627,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18819,21 +18895,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19086,21 +19163,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, 
v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19353,21 +19431,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: 
global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19620,21 +19699,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19887,21 +19967,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20154,21 +20235,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 
v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20421,21 +20503,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, 
i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20688,21 +20771,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index 7ddd515830e11..c8a45deccb462 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -833,23 +833,19 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_seq_cst_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4 @@ -1008,15 +1004,16 @@ define amdgpu_kernel void @global_system_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_unordered_store: -; GFX1250: ; %bb.0: 
; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out unordered, align 4 @@ -1174,15 +1171,16 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out monotonic, align 4 @@ -1369,20 +1367,19 @@ define amdgpu_kernel void @global_system_release_store( ; GFX12-CU-NEXT: 
global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out release, align 4 @@ -1569,20 +1566,19 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] 
scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4 @@ -1738,15 +1734,16 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in monotonic @@ -1933,17 +1930,18 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-LABEL: global_system_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire @@ -2128,20 +2126,19 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in release @@ -2357,22 +2354,21 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel @@ -2588,22 +2584,21 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst @@ -2805,18 +2800,19 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 
v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire @@ -3052,25 +3048,22 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; 
GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel @@ -3306,25 +3299,22 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_seq_cst_ret_atomicrmw: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst @@ -3545,19 +3535,20 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 
v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3809,21 +3800,22 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: 
s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4073,24 +4065,23 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4371,26 +4362,25 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr 
addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4671,26 +4661,25 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv 
scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4942,21 +4931,22 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5208,21 +5198,22 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv 
scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5503,26 +5494,25 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: 
s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5803,26 +5793,25 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: 
s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6103,26 +6092,25 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6403,26 +6391,25 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; 
GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6673,21 +6660,22 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
global_system_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6958,22 +6946,23 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 
0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7277,29 +7266,26 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 
v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7603,29 +7589,26 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
global_system_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr 
addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7900,24 +7883,23 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8188,22 +8170,23 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8507,29 +8490,26 @@ define amdgpu_kernel void 
@global_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8833,29 +8813,26 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS 
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9159,29 +9136,26 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9485,29 +9459,26 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; 
GFX1250-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9807,27 +9778,26 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], 
s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10131,29 +10101,26 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_relese_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; 
GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10457,29 +10424,26 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, 
s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10783,29 +10747,26 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] 
; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] 
+; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11634,23 +11595,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_seq_cst_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") seq_cst, align 4 @@ -11809,15 +11766,16 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 
0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") unordered, align 4 @@ -11975,15 +11933,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") monotonic, align 4 @@ -12170,20 +12129,19 @@ define amdgpu_kernel void 
@global_system_one_as_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") release, align 4 @@ -12370,20 +12328,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 
-; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") seq_cst, align 4 @@ -12539,15 +12496,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") monotonic @@ -12734,17 
+12692,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire @@ -12929,20 +12888,19 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v0, 
v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") release @@ -13158,22 +13116,21 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel @@ -13389,22 +13346,21 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst @@ 
-13606,18 +13562,19 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire @@ -13853,25 +13810,22 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: 
s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel @@ -14107,25 +14061,22 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: 
global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst @@ -14346,19 +14297,20 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: 
s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14610,21 +14562,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14874,24 +14827,23 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15172,26 +15124,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15472,26 +15423,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15743,21 +15693,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16009,21 +15960,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: 
s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16304,26 +16256,25 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16604,26 +16555,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16904,26 +16854,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17204,26 +17153,25 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 
0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17504,26 +17452,25 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17804,26 +17751,25 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18104,26 +18050,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18404,26 +18349,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18674,21 +18618,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18959,22 +18904,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], 
s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19256,26 +19202,25 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb 
scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19579,29 +19524,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19905,29 +19847,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; 
GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20202,24 +20141,23 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; 
GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20490,22 +20428,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; 
GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20809,29 +20748,26 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; 
GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21135,29 +21071,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; 
GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21461,29 +21394,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 
s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21787,29 +21717,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, 
v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22109,27 +22036,26 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22433,29 +22359,26 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, 
s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22759,29 +22682,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -23085,29 +23005,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; 
GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 1539fb574c0bd..f4fdec7490117 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -148,18 +148,16 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_volatile_load_0: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: 
global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_volatile_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(1) %in, align 4 @@ -357,22 +355,20 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_volatile_load_1: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s4, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_volatile_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 
0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1034,16 +1030,18 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_volatile_workgroup_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_volatile_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic volatile i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index 1aa8305b1a837..f66e6d00e6eab 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -945,15 +945,16 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-LABEL: global_wavefront_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") unordered, align 4 @@ -1111,15 +1112,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4 @@ -1277,15 +1279,16 @@ 
define amdgpu_kernel void @global_wavefront_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") release, align 4 @@ -1443,15 +1446,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) 
%out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") seq_cst, align 4 @@ -1607,15 +1611,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") monotonic @@ -1771,15 +1776,16 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire @@ -1935,15 +1941,16 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") release @@ -2099,15 +2106,16 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: 
global_wavefront_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel @@ -2263,15 +2271,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst @@ -2455,17 +2464,18 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: 
s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire @@ -2650,17 +2660,18 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel @@ -2845,17 +2856,18 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst @@ -3076,19 +3088,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3309,19 +3322,20 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: 
s_endpgm +; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3542,19 +3556,20 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3775,19 +3790,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4008,19 +4024,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; 
GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4241,19 +4258,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4474,19 +4492,20 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4707,19 +4726,20 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4940,19 +4960,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5173,19 +5194,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 
v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5406,19 +5428,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
-; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5639,19 +5662,20 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: 
; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5872,19 +5896,20 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6105,19 +6130,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; 
GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6338,19 +6364,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6601,21 +6628,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: 
; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6868,21 +6896,22 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 
0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7135,21 +7164,22 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7402,21 +7432,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7669,21 +7700,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7936,21 +7968,22 @@ define amdgpu_kernel void 
@global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8203,21 +8236,22 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; 
GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8470,21 +8504,22 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 
v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8737,21 +8772,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] 
offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9004,21 +9040,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; 
%entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9271,21 +9308,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9538,21 +9576,22 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9805,21 +9844,22 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm 
ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10072,21 +10112,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10339,21 +10380,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: 
global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11294,15 +11336,16 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 
0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") unordered, align 4 @@ -11460,15 +11503,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") monotonic, align 4 @@ -11626,15 +11670,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") release, align 4 @@ -11792,15 +11837,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out 
syncscope("wavefront-one-as") seq_cst, align 4 @@ -11956,15 +12002,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -12120,15 +12167,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12284,15 +12332,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") release @@ -12448,15 +12497,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: 
s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -12612,15 +12662,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -12804,17 +12855,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; 
GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12999,17 +13051,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -13194,17 +13247,18 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -13425,19 +13479,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13658,19 +13713,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13891,19 +13947,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14124,19 +14181,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 
v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14357,19 +14415,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14590,19 +14649,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
global_wavefront_one_as_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14823,19 +14883,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15056,19 +15117,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15289,19 +15351,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr 
addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15522,19 +15585,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15755,19 +15819,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15988,19 +16053,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: 
global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16221,19 +16287,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16454,19 +16521,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr 
addrspace(1) %out, i32 4 @@ -16687,19 +16755,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16950,21 +17019,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: 
s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17217,21 +17287,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17484,21 +17555,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; 
GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17751,21 +17823,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; 
%bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18018,21 +18091,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18285,21 +18359,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18552,21 +18627,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18819,21 +18895,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19086,21 +19163,22 
@@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19353,21 +19431,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19620,21 +19699,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 
0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19887,21 +19967,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 
def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20154,21 +20235,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; 
GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20421,21 +20503,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20688,21 +20771,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 3eab16e6b9713..bbbf8cf7f5cb1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -799,17 +799,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_seq_cst_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm 
ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") seq_cst, align 4 @@ -968,15 +969,16 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") unordered, align 4 @@ -1134,15 +1136,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") monotonic, align 4 @@ -1318,16 +1321,18 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4 @@ -1503,16 +1508,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; 
GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") seq_cst, align 4 @@ -1668,15 +1675,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") monotonic @@ -1842,15 +1850,17 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_acquire_atomicrmw: -; GFX1250: ; 
%bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire @@ -2024,16 +2034,18 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr 
addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") release @@ -2217,16 +2229,19 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel @@ -2410,16 +2425,19 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: 
global_workgroup_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2608,17 +2626,18 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire @@ 
-2828,18 +2847,20 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel @@ -3049,18 +3070,20 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] 
th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -3281,19 +3304,20 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3524,19 +3548,21 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_acquire_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 
%old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3775,20 +3801,22 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4037,20 +4065,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
global_workgroup_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4299,20 +4330,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4543,19 +4577,21 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: 
s_endpgm +; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4786,19 +4822,21 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, 
s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5047,20 +5085,23 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5309,20 +5350,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5571,20 +5615,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] 
offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5833,20 +5880,23 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 
s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6095,20 +6145,23 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; 
GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6357,20 +6410,23 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6619,20 +6675,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6881,20 +6940,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7145,21 +7207,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7417,21 +7480,22 @@ define amdgpu_kernel void 
@global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7702,22 +7766,24 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; 
GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7995,22 +8061,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 
s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8288,22 +8356,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8563,21 +8633,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8835,21 +8906,22 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: 
global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9127,22 +9199,24 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9420,22 +9494,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9713,22 +9789,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10006,22 +10084,24 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10297,22 +10377,24 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10590,22 +10672,24 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, 
v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10883,22 +10967,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11176,22 +11262,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11982,16 +12070,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_seq_cst_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -12150,15 +12240,16 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_unordered_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") unordered, align 4 @@ -12316,15 +12407,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_monotonic_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") monotonic, align 4 @@ -12492,15 +12584,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, 
s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") release, align 4 @@ -12668,15 +12763,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -12832,15 +12930,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_monotonic_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -13006,15 +13105,17 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acquire_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -13180,15 +13281,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") release @@ -13364,15 +13468,19 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -13548,15 +13656,19 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -13745,17 +13857,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: 
global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -13957,17 +14070,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; 
%entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -14169,17 +14285,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) 
%out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -14400,19 +14519,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14643,19 +14763,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: -; GFX1250: ; 
%bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14886,19 +15008,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15139,19 +15264,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15392,19 +15521,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15635,19 +15768,21 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr 
addrspace(1) %out, i32 4 @@ -15878,19 +16013,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16131,19 +16268,23 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16384,19 +16525,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16637,19 +16782,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16890,19 +17039,23 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 
def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17143,19 +17296,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17396,19 +17553,23 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17649,19 +17810,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 
; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17902,19 +18067,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; 
GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18165,21 +18334,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18437,21 +18607,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: 
global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18714,21 +18885,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18998,21 +19172,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19282,21 +19459,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19556,21 +19736,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19828,21 +20009,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: 
s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20112,21 +20294,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20396,21 
+20581,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20680,21 +20868,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, 
v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20964,21 +21155,24 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; 
%bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21246,21 +21440,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 
s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21530,21 +21727,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21814,21 +22014,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22098,21 +22301,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN 
-; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 102616b9a2065..7428ddc780675 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -756,18 +756,19 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_seq_cst_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_load_b32 v1, v0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: 
local_agent_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") seq_cst, align 4 @@ -1228,16 +1229,17 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") release, align 4 @@ -1397,16 +1399,17 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: 
s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 4 @@ -1883,16 +1886,17 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") release @@ -2068,17 +2072,18 @@ define amdgpu_kernel void 
@local_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel @@ -2254,17 +2259,18 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 
v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst @@ -2672,19 +2678,20 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel @@ -2892,19 +2899,20 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; 
GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst @@ -3466,18 +3474,19 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3680,19 +3689,20 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3895,19 +3905,20 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
local_agent_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4502,19 +4513,20 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, 
s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4717,19 +4729,20 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4932,19 +4945,20 @@ define amdgpu_kernel void 
@local_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5147,19 +5161,20 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; 
GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5362,19 +5377,20 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, 
i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5577,19 +5593,20 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5792,19 +5809,20 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; 
GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6007,19 +6025,20 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, 
v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6705,21 +6724,22 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6955,21 +6975,22 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; 
%entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7205,21 +7226,22 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7917,21 +7939,22 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8167,21 +8190,22 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ 
-8417,21 +8441,22 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8667,21 +8692,22 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 
v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8917,21 +8943,22 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 
s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9167,21 +9194,22 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: 
s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9417,21 +9445,22 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9667,21 +9696,22 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: 
local_agent_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 1356fe4854170..d57736ba0230c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -756,18 +756,19 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_seq_cst_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 -; 
GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_load_b32 v1, v0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in seq_cst, align 4 @@ -1228,16 +1229,17 @@ define amdgpu_kernel void @local_system_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out release, align 4 @@ -1397,16 
+1399,17 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out seq_cst, align 4 @@ -1883,16 +1886,17 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in release @@ -2068,17 +2072,18 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel @@ -2254,17 +2259,18 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_seq_cst_atomicrmw: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst @@ -2672,19 +2678,20 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel @@ -2892,19 +2899,20 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; 
GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst @@ -3466,18 +3474,19 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_release_monotonic_cmpxchg: +; GFX1250-CU: ; 
%bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3680,19 +3689,20 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3895,19 +3905,20 @@ 
define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4502,19 +4513,20 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; 
GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4717,19 +4729,20 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr 
addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4932,19 +4945,20 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5147,19 +5161,20 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; 
GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5362,19 +5377,20 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5577,19 +5593,20 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5792,19 +5809,20 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 
0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6007,19 +6025,20 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6705,21 +6724,22 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6955,21 +6975,22 @@ define amdgpu_kernel void 
@local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7205,21 +7226,22 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7917,21 +7939,22 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8167,21 +8190,22 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8417,21 +8441,22 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8667,21 +8692,22 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: -; 
GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8917,21 +8943,22 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9167,21 +9194,22 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9417,21 +9445,22 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr 
addrspace(3) %out, i32 4 @@ -9667,21 +9696,22 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 75e28f9008e28..d8ba02adf4b35 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -883,16 +883,17 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX12-CU-NEXT: 
ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_volatile_workgroup_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_volatile_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic volatile i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 6aaf9d323b1fd..7220c071bf657 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -756,18 +756,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_seq_cst_load: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_load_b32 v1, v0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_seq_cst_load: +; GFX1250-CU: ; 
%bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") seq_cst, align 4 @@ -1228,16 +1229,17 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_release_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4 @@ -1397,16 +1399,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_seq_cst_store: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, 
s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") seq_cst, align 4 @@ -1883,16 +1886,17 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_release_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") release @@ -2068,17 +2072,18 @@ define amdgpu_kernel 
void @local_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_acq_rel_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel @@ -2254,17 +2259,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_seq_cst_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst @@ -2672,19 +2678,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_acq_rel_ret_atomicrmw: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel @@ -2892,19 +2899,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_seq_cst_ret_atomicrmw: -; GFX1250: ; %bb.0: ; 
%entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst @@ -3466,18 +3474,19 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_release_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3680,19 +3689,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3895,19 +3905,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 
; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4502,19 +4513,20 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_release_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_release_acquire_cmpxchg: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4717,19 +4729,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, 
ptr addrspace(3) %out, i32 4 @@ -4932,19 +4945,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5147,19 +5161,20 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: 
ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5362,19 +5377,20 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 
v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5577,19 +5593,20 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_release_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5792,19 +5809,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: 
v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6007,19 +6025,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6705,21 +6724,22 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6955,21 +6975,22 @@ define amdgpu_kernel void 
@local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7205,21 +7226,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7917,21 +7939,22 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_release_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8167,21 +8190,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8417,21 +8441,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8667,21 +8692,22 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8917,21 +8943,22 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: 
ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9167,21 +9194,22 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9417,21 +9445,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9667,21 +9696,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ds_store_b32 v0, v1 -; GFX1250-NEXT: s_endpgm +; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 From df1db640dbfeb785251e02a254663c0046c01149 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Thu, 21 Aug 2025 12:35:47 +0200 Subject: [PATCH 2/5] clang-format --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 95fa03fc97e69..0451b27bc81c5 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -2412,7 +2412,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: - case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WORKGROUP: // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is // not needed as LDS operations for all waves are executed in a total // global ordering as observed by all waves. Required if also From 52f20d7b43b03e8e5e31a7819ae9fd78e3f15192 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Thu, 21 Aug 2025 12:44:08 +0200 Subject: [PATCH 3/5] Drop -CU suffix --- .../memory-legalizer-fence-mmra-global.ll | 240 +- .../CodeGen/AMDGPU/memory-legalizer-fence.ll | 240 +- .../AMDGPU/memory-legalizer-flat-agent.ll | 2930 ++++++++--------- .../AMDGPU/memory-legalizer-flat-lastuse.ll | 20 +- .../memory-legalizer-flat-nontemporal.ll | 20 +- .../memory-legalizer-flat-singlethread.ll | 2304 ++++++------- .../AMDGPU/memory-legalizer-flat-system.ll | 2930 ++++++++--------- .../AMDGPU/memory-legalizer-flat-volatile.ll | 72 +- .../AMDGPU/memory-legalizer-flat-wavefront.ll | 2272 ++++++------- .../AMDGPU/memory-legalizer-flat-workgroup.ll | 2512 +++++++------- .../AMDGPU/memory-legalizer-global-agent.ll | 2868 ++++++++-------- .../AMDGPU/memory-legalizer-global-lastuse.ll | 20 +- .../memory-legalizer-global-nontemporal.ll | 20 +- .../memory-legalizer-global-singlethread.ll | 2304 ++++++------- .../AMDGPU/memory-legalizer-global-system.ll | 2706 +++++++-------- .../memory-legalizer-global-volatile.ll | 72 +- .../memory-legalizer-global-wavefront.ll | 2304 ++++++------- .../memory-legalizer-global-workgroup.ll | 2648 +++++++-------- .../AMDGPU/memory-legalizer-local-agent.ll | 854 ++--- 
.../AMDGPU/memory-legalizer-local-system.ll | 854 ++--- .../AMDGPU/memory-legalizer-local-volatile.ll | 22 +- .../memory-legalizer-local-workgroup.ll | 854 ++--- 22 files changed, 14533 insertions(+), 14533 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll index 209775314a505..6a76f4307dcad 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll @@ -80,11 +80,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -153,11 +153,11 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -231,11 +231,11 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -309,11 +309,11 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -385,11 +385,11 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -458,11 +458,11 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -536,11 +536,11 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -614,11 +614,11 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -801,12 +801,12 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence 
syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -906,13 +906,13 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1012,13 +1012,13 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1201,12 +1201,12 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: 
s_endpgm +; GFX1250-LABEL: agent_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1306,13 +1306,13 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1412,13 +1412,13 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1607,12 +1607,12 @@ define amdgpu_kernel void 
@system_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1718,13 +1718,13 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1830,13 +1830,13 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: 
global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -2025,12 +2025,12 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -2136,13 +2136,13 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -2248,13 +2248,13 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; 
GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index 07db15ee8e60e..736a8b58466dd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -1064,11 +1064,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire ret void @@ -1145,11 +1145,11 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") release ret void @@ -1231,11 +1231,11 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
workgroup_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel ret void @@ -1317,11 +1317,11 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst ret void @@ -1393,11 +1393,11 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire ret void @@ -1466,11 +1466,11 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 
+; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release ret void @@ -1544,11 +1544,11 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel ret void @@ -1622,11 +1622,11 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst ret void @@ -1809,12 +1809,12 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") release ret void @@ -1914,13 +1914,13 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; 
GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel ret void @@ -2020,13 +2020,13 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst ret void @@ -2209,12 +2209,12 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: 
s_endpgm entry: fence syncscope("agent-one-as") release ret void @@ -2314,13 +2314,13 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel ret void @@ -2420,13 +2420,13 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst ret void @@ -2615,12 +2615,12 @@ define amdgpu_kernel void @system_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_release_fence: +; 
GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence release ret void @@ -2726,13 +2726,13 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence acq_rel ret void @@ -2838,13 +2838,13 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence seq_cst ret void @@ -3033,12 +3033,12 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") release ret void @@ -3144,13 +3144,13 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel ret void @@ -3256,13 +3256,13 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index fe7fd8522bd6a..55ec0c2255f9b 100644 --- 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -825,19 +825,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") seq_cst, align 4 @@ -989,16 +989,16 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; 
GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") unordered, align 4 @@ -1149,16 +1149,16 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") monotonic, align 4 @@ -1333,19 +1333,19 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") release, align 4 @@ -1520,19 +1520,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") seq_cst, align 4 @@ -1683,16 +1683,16 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") monotonic @@ -1874,18 +1874,18 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: 
flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire @@ -2060,19 +2060,19 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, 
s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") release @@ -2278,21 +2278,21 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel @@ -2498,21 +2498,21 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst @@ -2722,19 +2722,19 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: 
s_endpgm +; GFX1250-LABEL: flat_agent_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire @@ -2973,22 +2973,22 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 
0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel @@ -3227,22 +3227,22 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] 
scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst @@ -3482,20 +3482,20 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3766,22 +3766,22 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4045,23 +4045,23 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4356,25 +4356,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, 
v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4669,25 +4669,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; 
GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4958,22 +4958,22 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5244,22 +5244,22 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
flat_agent_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5554,25 +5554,25 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5867,25 +5867,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6180,25 +6180,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 
0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6493,25 +6493,25 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6806,25 +6806,25 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 
s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7119,25 +7119,25 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7432,25 +7432,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7745,25 +7745,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8047,22 +8047,22 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8363,23 +8363,23 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8689,25 +8689,25 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb 
scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9036,26 +9036,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9384,26 +9384,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9708,23 +9708,23 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10025,23 +10025,23 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10370,26 +10370,26 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10718,26 +10718,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: 
+; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11066,26 +11066,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv 
scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11414,26 +11414,26 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: 
flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11758,26 +11758,26 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: 
global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12106,26 +12106,26 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12454,26 +12454,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 
s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12802,26 +12802,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
flat_agent_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13664,20 +13664,20 
@@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") seq_cst, align 4 @@ -13829,16 +13829,16 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] 
scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") unordered, align 4 @@ -13989,16 +13989,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") monotonic, align 4 @@ -14173,19 +14173,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") release, align 4 @@ -14360,19 +14360,19 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") seq_cst, align 4 @@ -14523,16 +14523,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") monotonic @@ -14710,18 +14710,18 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire @@ -14896,19 +14896,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") release @@ -15110,21 +15110,21 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel @@ -15326,21 +15326,21 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
flat_agent_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst @@ -15560,20 +15560,20 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 
0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire @@ -15822,23 +15822,23 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel @@ -16087,23 +16087,23 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV 
+; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst @@ -16343,20 +16343,20 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] 
offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16623,22 +16623,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16902,23 +16902,23 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17209,25 +17209,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; 
%entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17518,25 +17518,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17803,22 +17803,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18085,22 +18085,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18391,25 +18391,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 
0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18700,25 +18700,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19009,25 +19009,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; 
GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19318,25 +19318,25 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: 
flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19627,25 +19627,25 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 
v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19936,25 +19936,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] 
offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20245,25 +20245,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV 
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20554,25 +20554,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20856,22 +20856,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: 
s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21182,24 +21182,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: 
+; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21509,25 +21509,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
flat_agent_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21866,27 +21866,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv 
scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22225,27 +22225,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22560,24 +22560,24 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 
v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22888,24 +22888,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23244,27 +23244,27 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23603,27 +23603,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23962,27 +23962,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: 
flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] 
scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24321,27 +24321,27 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24676,27 +24676,27 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25035,27 +25035,27 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25394,27 +25394,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] 
scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25753,27 +25753,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, 
v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index 22c1b6f9fe875..faa970e049bd2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -107,16 +107,16 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) { ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_last_use_and_volatile_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_last_use_and_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm entry: %val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr %out diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index c949790b97d72..721ecd8da5387 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -1322,16 +1322,16 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_nontemporal_volatile_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_nontemporal_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load volatile i32, ptr %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index 8a75db2c36dc7..635895259ee32 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -929,16 +929,16 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") unordered, align 4 @@ -1089,16 +1089,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") monotonic, align 4 @@ -1249,16 +1249,16 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") release, align 4 @@ -1409,16 +1409,16 @@ define amdgpu_kernel void 
@flat_singlethread_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") seq_cst, align 4 @@ -1569,16 +1569,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 
v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") monotonic @@ -1729,16 +1729,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire @@ -1889,16 +1889,16 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") release @@ -2049,16 +2049,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel @@ -2209,16 +2209,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: 
flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst @@ -2413,18 +2413,18 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire @@ -2620,18 +2620,18 @@ define amdgpu_kernel void 
@flat_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel @@ -2827,18 +2827,18 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, 
s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst @@ -3078,20 +3078,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3331,20 +3331,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3584,20 +3584,20 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: 
s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3837,20 +3837,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4090,20 +4090,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 
s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4343,20 +4343,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 
%old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4596,20 +4596,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4849,20 +4849,20 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5102,20 +5102,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; 
GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5355,20 +5355,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5608,20 +5608,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5861,20 +5861,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] 
offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6114,20 +6114,20 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6367,20 +6367,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6620,20 +6620,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr 
%out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6917,22 +6917,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7218,22 +7218,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; 
GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7519,22 +7519,22 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7820,22 +7820,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8121,22 +8121,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8422,22 +8422,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; 
GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8723,22 +8723,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9024,22 +9024,22 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, 
s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9325,22 +9325,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9626,22 +9626,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9927,22 +9927,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, ptr %out, i32 4 @@ -10228,22 +10228,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10529,22 +10529,22 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10830,22 +10830,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11131,22 +11131,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12072,16 +12072,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") unordered, align 4 @@ -12232,16 +12232,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") monotonic, align 4 @@ -12392,16 +12392,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") release, align 4 @@ -12552,16 +12552,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -12712,16 +12712,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -12872,16 +12872,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { 
entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire @@ -13032,16 +13032,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") release @@ -13192,16 +13192,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -13352,16 +13352,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -13556,18 +13556,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire @@ -13763,18 +13763,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: 
flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -13970,18 +13970,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -14221,20 +14221,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: 
; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14474,20 +14474,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14727,20 +14727,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14980,20 +14980,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, 
i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15233,20 +15233,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15486,20 +15486,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15739,20 +15739,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15992,20 +15992,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16245,20 +16245,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16498,20 +16498,20 @@ define 
amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16751,20 +16751,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17004,20 +17004,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: 
s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17257,20 +17257,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17510,20 +17510,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17763,20 +17763,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18060,22 +18060,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18361,22 +18361,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18662,22 +18662,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; 
GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18963,22 +18963,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19264,22 +19264,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19565,22 +19565,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19866,22 +19866,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20167,22 +20167,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: 
flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20468,22 +20468,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20769,22 +20769,22 
@@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21070,22 +21070,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21371,22 +21371,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21672,22 +21672,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21973,22 +21973,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22274,22 +22274,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; 
GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index b5ea23d4655b6..e45a8e51c836c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -829,19 +829,19 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; 
GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in seq_cst, align 4 @@ -993,16 +993,16 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out unordered, align 4 @@ -1153,16 +1153,16 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, 
s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out monotonic, align 4 @@ -1341,19 +1341,19 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] 
scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out release, align 4 @@ -1532,19 +1532,19 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out seq_cst, align 4 @@ -1695,16 +1695,16 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, 
v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in monotonic @@ -1888,18 +1888,18 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acquire @@ -2078,19 +2078,19 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-CU-LABEL: flat_system_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in release @@ -2302,21 +2302,21 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; 
GFX1250-LABEL: flat_system_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel @@ -2528,21 +2528,21 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, 
s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst @@ -2754,19 +2754,19 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acquire @@ -3011,22 +3011,22 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel @@ -3271,22 +3271,22 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst @@ -3526,20 +3526,20 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
flat_system_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3812,22 +3812,22 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4095,23 +4095,23 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 
0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4412,25 +4412,25 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4731,25 +4731,25 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5022,22 +5022,22 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ 
-5310,22 +5310,22 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5626,25 +5626,25 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5945,25 +5945,25 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6264,25 +6264,25 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6583,25 +6583,25 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6902,25 +6902,25 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7221,25 +7221,25 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7540,25 +7540,25 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 
0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7859,25 +7859,25 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8161,22 +8161,22 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8479,23 +8479,23 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8809,25 +8809,25 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9162,26 +9162,26 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9516,26 +9516,26 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9842,23 +9842,23 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: -; 
GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10161,23 +10161,23 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10512,26 +10512,26 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10866,26 +10866,26 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
flat_system_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11220,26 +11220,26 
@@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: 
flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11574,26 +11574,26 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 
v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11924,26 +11924,26 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb 
scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12278,26 +12278,26 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12632,26 +12632,26 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12986,26 +12986,26 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; 
GFX1250-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13852,20 +13852,20 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 
0x8 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") seq_cst, align 4 @@ -14017,16 +14017,16 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") unordered, align 4 @@ -14177,16 +14177,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") monotonic, align 4 @@ -14365,19 +14365,19 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, 
ptr %out syncscope("one-as") release, align 4 @@ -14556,19 +14556,19 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") seq_cst, align 4 @@ -14719,16 +14719,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] 
scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") monotonic @@ -14908,18 +14908,18 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire @@ -15098,19 +15098,19 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 
v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") release @@ -15318,21 +15318,21 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel @@ -15540,21 +15540,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 
+; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst @@ -15776,20 +15776,20 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire @@ -16044,23 +16044,23 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; 
GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel @@ -16315,23 +16315,23 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst @@ -16571,20 +16571,20 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16853,22 +16853,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: 
global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17136,23 +17136,23 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 
v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17449,25 +17449,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17764,25 +17764,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: 
s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18051,22 +18051,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18335,22 +18335,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18647,25 +18647,25 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18962,25 +18962,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: 
global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19277,25 +19277,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv 
scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19592,25 +19592,25 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS 
+; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19907,25 +19907,25 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: 
s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20222,25 +20222,25 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 
%in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20537,25 +20537,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, ptr %out, i32 4 @@ -20852,25 +20852,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr 
i32, ptr %out, i32 4 @@ -21154,22 +21154,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21482,24 +21482,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: 
flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21813,25 +21813,25 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: 
flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22176,27 +22176,27 @@ define 
amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv 
scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22541,27 +22541,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22878,24 +22878,24 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23208,24 +23208,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23570,27 +23570,27 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23935,27 +23935,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24300,27 +24300,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; 
GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24665,27 +24665,27 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25026,27 +25026,27 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25391,27 +25391,27 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25756,27 +25756,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; 
GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, 
s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -26121,27 +26121,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index 68af003ba6353..41c5927cad4de 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -145,16 +145,16 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_nontemporal_load_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_nontemporal_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load volatile i32, ptr %in, align 4 @@ -420,20 +420,20 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_nontemporal_load_1: -; GFX1250-CU: ; 
%bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_nontemporal_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1140,18 +1140,18 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_volatile_workgroup_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_volatile_workgroup_release_store: +; GFX1250: ; 
%bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic volatile i32 %in, ptr %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index a4804675fd3cf..041b3f51abc2f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -929,16 +929,16 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") unordered, align 4 @@ -1089,16 +1089,16 @@ define amdgpu_kernel void 
@flat_wavefront_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") monotonic, align 4 @@ -1249,16 +1249,16 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; 
GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") release, align 4 @@ -1409,16 +1409,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") seq_cst, align 4 @@ -1569,16 +1569,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, 
s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") monotonic @@ -1729,16 +1729,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire @@ -1889,16 +1889,16 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; 
GFX1250-LABEL: flat_wavefront_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") release @@ -2049,16 +2049,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel @@ -2209,16 +2209,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst @@ -2413,18 +2413,18 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr 
%out, i32 %in syncscope("wavefront") acquire @@ -2620,18 +2620,18 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel @@ -2827,18 +2827,18 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; 
GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst @@ -3078,20 +3078,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3331,20 +3331,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3584,20 +3584,20 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3837,20 +3837,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4090,20 +4090,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4343,20 +4343,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
+; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4596,20 +4596,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4849,20 +4849,20 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5102,20 +5102,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] 
offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5355,20 +5355,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5608,20 +5608,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5861,20 +5861,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6114,20 +6114,20 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6367,20 +6367,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6620,20 +6620,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr 
i32, ptr %out, i32 4 @@ -6917,22 +6917,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7218,22 +7218,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7519,22 +7519,22 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7820,22 +7820,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8121,22 +8121,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; 
GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8422,22 +8422,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] 
scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8723,22 +8723,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9024,22 +9024,22 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9325,22 +9325,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9626,22 +9626,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: 
s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9927,22 +9927,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10228,22 +10228,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; 
GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10529,22 +10529,22 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 
v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10830,22 +10830,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11131,22 +11131,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12072,16 +12072,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") unordered, align 4 @@ -12232,16 +12232,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") monotonic, align 4 @@ -12392,16 +12392,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_release_store: 
+; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") release, align 4 @@ -12552,16 +12552,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -12712,16 +12712,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -12872,16 +12872,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire @@ -13032,16 +13032,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") release @@ -13192,16 +13192,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw 
volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -13352,16 +13352,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -13556,18 +13556,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; 
GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire @@ -13763,18 +13763,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -13970,18 +13970,18 @@ 
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -14221,20 +14221,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14474,20 +14474,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14727,20 +14727,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; 
GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14980,20 +14980,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15233,20 +15233,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15486,20 +15486,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15739,20 +15739,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15992,20 +15992,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16245,20 +16245,20 @@ define 
amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16498,20 +16498,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 
s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16751,20 +16751,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17004,20 +17004,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17257,20 +17257,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17510,20 +17510,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17763,20 +17763,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18060,22 +18060,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18361,22 +18361,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18662,22 +18662,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18963,22 +18963,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19264,22 +19264,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, 
s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19565,22 +19565,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19866,22 +19866,22 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20167,22 +20167,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20468,22 +20468,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20769,22 +20769,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21070,22 +21070,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21371,22 +21371,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21672,22 +21672,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: +; GFX1250: ; 
%bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21973,22 +21973,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, 
s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 01801637ce770..85ecab8128d2f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -811,18 +811,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: 
flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4 @@ -974,16 +974,16 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4 @@ -1134,16 +1134,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 
s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4 @@ -1311,18 +1311,18 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4 @@ -1490,18 +1490,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4 @@ -1652,16 +1652,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic @@ -1831,17 +1831,17 @@ define amdgpu_kernel void 
@flat_workgroup_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire @@ -2009,18 +2009,18 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") release @@ -2207,19 +2207,19 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel @@ -2406,19 +2406,19 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -2625,18 +2625,18 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire @@ -2863,20 +2863,20 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel @@ -3103,20 +3103,20 @@ define amdgpu_kernel void 
@flat_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -3356,20 +3356,20 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3628,21 +3628,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; 
GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3899,22 +3899,22 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4190,23 +4190,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; 
GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4482,23 +4482,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4757,21 +4757,21 @@ define amdgpu_kernel void 
@flat_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5030,21 +5030,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5320,23 +5320,23 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: 
flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5612,23 +5612,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5904,23 +5904,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6196,23 +6196,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6496,22 +6496,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 
%in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6809,22 +6809,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7127,24 +7127,24 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 
v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7461,24 +7461,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: 
s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7795,24 +7795,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
flat_workgroup_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8112,22 +8112,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
flat_workgroup_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8425,22 +8425,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 
s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8757,24 +8757,24 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9091,24 +9091,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9425,24 +9425,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9759,24 +9759,24 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10091,24 +10091,24 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 
0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10425,24 +10425,24 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10759,24 +10759,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11093,24 +11093,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11903,18 +11903,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], 
s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -12066,16 +12066,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") unordered, align 4 @@ -12226,16 +12226,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4 @@ -12396,18 +12396,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4 @@ -12568,18 +12568,18 @@ define amdgpu_kernel 
void @flat_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -12730,16 +12730,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -12900,17 +12900,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire @@ -13071,18 +13071,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") release @@ -13253,19 +13253,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] 
+; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -13436,19 +13436,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -13651,18 +13651,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire @@ -13878,20 +13878,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -14107,20 +14107,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") 
seq_cst @@ -14360,20 +14360,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14623,21 +14623,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14887,22 +14887,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: 
s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15162,23 +15162,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15438,23 +15438,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, 
s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15704,21 +15704,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 
v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15968,21 +15968,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16242,23 +16242,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; 
GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16518,23 +16518,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16794,23 +16794,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17070,23 +17070,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17346,23 +17346,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, 
v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17622,23 +17622,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; 
GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17898,23 +17898,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18174,23 +18174,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18474,22 +18474,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: 
flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18783,22 +18783,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19094,24 +19094,24 @@ 
define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19417,24 +19417,24 @@ define amdgpu_kernel 
void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19740,24 +19740,24 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20053,22 +20053,22 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20362,22 +20362,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20683,24 +20683,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21006,24 +21006,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 
s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21329,24 +21329,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21652,24 +21652,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21973,24 +21973,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22296,24 +22296,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22619,24 +22619,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22942,24 +22942,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 
v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index ad163cefe57d4..5c2d8eb4f5ec0 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -829,19 +829,19 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4 @@ -1000,16 +1000,16 @@ define amdgpu_kernel void @global_agent_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 
v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") unordered, align 4 @@ -1167,16 +1167,16 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4 @@ -1359,19 +1359,19 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") release, align 4 @@ -1554,19 +1554,19 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; 
GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4 @@ -1722,16 +1722,16 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") monotonic @@ -1916,18 +1916,18 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm 
+; GFX1250-LABEL: global_agent_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire @@ -2108,19 +2108,19 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) 
%out, i32 %in syncscope("agent") release @@ -2330,21 +2330,21 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel @@ -2554,21 +2554,21 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -2768,19 +2768,19 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acquire_ret_atomicrmw: +; GFX1250: ; 
%bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire @@ -3010,22 +3010,22 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: 
global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel @@ -3255,22 +3255,22 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: 
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -3491,20 +3491,20 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3754,22 +3754,22 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4015,23 +4015,23 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4306,25 +4306,25 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4599,25 +4599,25 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4867,22 +4867,22 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5132,22 +5132,22 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv 
scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5422,25 +5422,25 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; 
%entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5715,25 +5715,25 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; 
%entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6008,25 +6008,25 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; 
%entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6301,25 +6301,25 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_seq_cst_cmpxchg: +; GFX1250: ; 
%bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6594,25 +6594,25 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acquire_seq_cst_cmpxchg: +; GFX1250: ; 
%bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6887,25 +6887,25 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_seq_cst_cmpxchg: +; GFX1250: ; 
%bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7180,25 +7180,25 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; 
%bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7473,25 +7473,25 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; 
%bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7742,22 +7742,22 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8026,23 +8026,23 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8320,25 +8320,25 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8636,26 +8636,26 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8953,26 +8953,26 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, 
s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9245,23 +9245,23 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; 
GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9530,23 +9530,23 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; 
GFX1250-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9844,26 +9844,26 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, 
v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10161,26 +10161,26 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] 
offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10478,26 +10478,26 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10795,26 +10795,26 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11108,26 +11108,26 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11425,26 +11425,26 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; 
%entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11742,26 +11742,26 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; 
GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 
%old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -12059,26 +12059,26 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -12903,24 +12903,24 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm - ptr addrspace(1) %in, ptr addrspace(1) %out) { -entry: - %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4 - store i32 %val, ptr addrspace(1) %out - ret void +; GFX1250-LABEL: global_agent_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4 + store i32 %val, ptr addrspace(1) %out + ret void } define amdgpu_kernel void @global_agent_one_as_unordered_store( @@ 
-13074,16 +13074,16 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") unordered, align 4 @@ -13241,16 +13241,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") monotonic, align 4 @@ -13433,19 +13433,19 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") release, align 4 @@ -13628,19 +13628,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") seq_cst, align 4 @@ -13796,16 +13796,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") monotonic @@ -13990,18 +13990,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire @@ -14182,19 +14182,19 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") release @@ -14404,21 +14404,21 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -14628,21 +14628,21 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -14842,19 +14842,19 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire @@ -15084,22 +15084,22 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -15329,22 +15329,22 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV 
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -15565,20 +15565,20 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; 
GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15828,22 +15828,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16089,23 +16089,23 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16380,25 +16380,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16673,25 +16673,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16941,22 +16941,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17206,22 +17206,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv 
scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17496,25 +17496,25 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: 
global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17789,25 +17789,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18082,25 +18082,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18375,25 +18375,25 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV 
+; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18668,25 +18668,25 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18961,25 +18961,25 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] 
offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19254,25 +19254,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, 
v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19547,25 +19547,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: 
global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19816,22 +19816,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, 
s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20100,23 +20100,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 
%in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20414,26 +20414,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; 
GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20731,26 +20731,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21023,23 +21023,23 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21308,23 +21308,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, 
v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21622,26 +21622,26 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb 
scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21939,26 +21939,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22256,26 +22256,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22573,26 +22573,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: 
s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22886,26 +22886,26 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -23203,26 +23203,26 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: 
s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -23520,26 +23520,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -23837,26 +23837,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 
0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index bda702156905a..ca7802d295e0b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll 
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -87,16 +87,16 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_last_use_and_volatile_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_last_use_and_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm entry: %val = load volatile i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index 4575cbbfd839e..d74c230488ea2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -1105,16 +1105,16 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_nontemporal_volatile_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_nontemporal_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 4f2ea4493560f..e7f7b1d196be7 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -945,16 +945,16 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 
s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") unordered, align 4 @@ -1112,16 +1112,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") monotonic, align 4 @@ -1279,16 +1279,16 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_store: +; 
GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") release, align 4 @@ -1446,16 +1446,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") seq_cst, align 4 @@ -1611,16 +1611,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") monotonic @@ -1776,16 +1776,16 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire @@ -1941,16 +1941,16 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 
v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") release @@ -2106,16 +2106,16 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; 
GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel @@ -2271,16 +2271,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst @@ -2464,18 +2464,18 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; 
GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire @@ -2660,18 +2660,18 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr 
addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel @@ -2856,18 +2856,18 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst @@ -3088,20 +3088,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, 
s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3322,20 +3322,20 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; 
%entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3556,20 +3556,20 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3790,20 +3790,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4024,20 +4024,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 
v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4258,20 +4258,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4492,20 +4492,20 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_acquire_cmpxchg: +; GFX1250: ; 
%bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4726,20 +4726,20 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4960,20 +4960,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5194,20 +5194,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, 
v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5428,20 +5428,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5662,20 +5662,20 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250: ; 
%bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5896,20 +5896,20 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6130,20 +6130,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6364,20 +6364,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, 
v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6628,22 +6628,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, 
s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6896,22 +6896,22 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7164,22 +7164,22 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; 
GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7432,22 +7432,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; 
GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7700,22 +7700,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7968,22 +7968,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8236,22 +8236,22 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8504,22 +8504,22 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) 
%out, i32 4 @@ -8772,22 +8772,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9040,22 +9040,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9308,22 +9308,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9576,22 +9576,22 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9844,22 +9844,22 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10112,22 +10112,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 
v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10380,22 +10380,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11336,16 +11336,16 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") unordered, align 4 @@ -11503,16 +11503,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") monotonic, align 4 @@ -11670,16 +11670,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 
%in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") release, align 4 @@ -11837,16 +11837,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -12002,16 +12002,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; 
GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -12167,16 +12167,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12332,16 +12332,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") release @@ -12497,16 +12497,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -12662,16 
+12662,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -12855,18 +12855,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; 
GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -13051,18 +13051,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -13247,18 +13247,18 @@ 
define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -13479,20 +13479,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13713,20 +13713,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; 
GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13947,20 +13947,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14181,20 +14181,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14415,20 +14415,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; 
GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14649,20 +14649,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 
s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14883,20 +14883,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; 
GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15117,20 +15117,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15351,20 +15351,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) 
%out, i32 4 @@ -15585,20 +15585,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15819,20 +15819,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16053,20 +16053,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 
v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16287,20 +16287,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16521,20 +16521,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; 
GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16755,20 +16755,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17019,22 +17019,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17287,22 +17287,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17555,22 +17555,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17823,22 +17823,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18091,22 +18091,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18359,22 +18359,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18627,22 +18627,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18895,22 +18895,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19163,22 +19163,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19431,22 +19431,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19699,22 +19699,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ 
-19967,22 +19967,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20235,22 +20235,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: 
s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20503,22 +20503,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20771,22 +20771,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index c8a45deccb462..e7880a81800fd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -833,19 +833,19 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4 @@ -1004,16 +1004,16 @@ define amdgpu_kernel void @global_system_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, 
ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out unordered, align 4 @@ -1171,16 +1171,16 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out monotonic, align 4 @@ -1367,19 +1367,19 @@ define amdgpu_kernel void @global_system_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_release_store: +; 
GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out release, align 4 @@ -1566,19 +1566,19 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4 @@ -1734,16 +1734,16 @@ define amdgpu_kernel void 
@global_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in monotonic @@ -1930,18 +1930,18 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 
0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire @@ -2126,19 +2126,19 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in release @@ -2354,21 +2354,21 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
global_system_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel @@ -2584,21 +2584,21 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst @@ -2800,19 +2800,19 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire @@ -3048,22 +3048,22 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 
%in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel @@ -3299,22 +3299,22 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst @@ -3535,20 +3535,20 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; 
-; GFX1250-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3800,22 +3800,22 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4065,23 +4065,23 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4362,25 +4362,25 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], 
s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4661,25 +4661,25 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, 
v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4931,22 +4931,22 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv 
scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5198,22 +5198,22 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, 
s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5494,25 +5494,25 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 
v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5793,25 +5793,25 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6092,25 +6092,25 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6391,25 +6391,25 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6660,22 +6660,22 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, 
s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6946,23 +6946,23 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7266,26 +7266,26 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7589,26 +7589,26 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7883,23 +7883,23 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8170,23 +8170,23 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8490,26 +8490,26 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, 
s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8813,26 +8813,26 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; 
%entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9136,26 +9136,26 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS 
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9459,26 +9459,26 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: 
global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9778,26 +9778,26 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: 
global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10101,26 +10101,26 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_relese_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10424,26 +10424,26 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10747,26 +10747,26 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
global_system_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11595,19 +11595,19 @@ 
define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") seq_cst, align 4 @@ -11766,16 +11766,16 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; 
GFX1250-LABEL: global_system_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") unordered, align 4 @@ -11933,16 +11933,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") monotonic, align 4 @@ -12129,19 +12129,19 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") release, align 4 @@ -12328,19 +12328,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; 
GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") seq_cst, align 4 @@ -12496,16 +12496,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") monotonic @@ -12692,18 +12692,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire @@ -12888,19 +12888,19 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") release @@ -13116,21 +13116,21 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in 
syncscope("one-as") acq_rel @@ -13346,21 +13346,21 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst @@ -13562,19 +13562,19 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, 
s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire @@ -13810,22 +13810,22 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; 
GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel @@ -14061,22 +14061,22 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst @@ -14297,20 +14297,20 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 
v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14562,22 +14562,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14827,23 +14827,23 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15124,25 +15124,25 @@ define amdgpu_kernel void 
@global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15423,25 +15423,25 @@ define 
amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15693,22 
+15693,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15960,22 +15960,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
global_system_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16256,25 +16256,25 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16555,25 +16555,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16854,25 +16854,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17153,25 +17153,25 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17452,25 +17452,25 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17751,25 +17751,25 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18050,25 +18050,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18349,25 +18349,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18618,22 +18618,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; 
%bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18904,23 +18904,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19202,25 +19202,25 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19524,26 +19524,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19847,26 +19847,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ 
-20141,23 +20141,23 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20428,23 +20428,23 @@ define amdgpu_kernel void 
@global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20748,26 +20748,26 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; 
GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, 
i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21071,26 +21071,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; 
GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21394,26 +21394,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21717,26 +21717,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: 
; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22036,26 +22036,26 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, 
s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22359,26 +22359,26 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
global_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22682,26 +22682,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; 
GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -23005,26 +23005,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; 
GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index f4fdec7490117..3bf5ed8b2397f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -148,16 +148,16 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_volatile_load_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], 
s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_volatile_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(1) %in, align 4 @@ -355,20 +355,20 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_volatile_load_1: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_volatile_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1030,18 +1030,18 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_volatile_workgroup_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_volatile_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic volatile i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index f66e6d00e6eab..09eb062d876f6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -945,16 +945,16 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; GFX12-CU-NEXT: global_store_b32 
v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") unordered, align 4 @@ -1112,16 +1112,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: 
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4 @@ -1279,16 +1279,16 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") release, align 4 @@ -1446,16 +1446,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 
0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") seq_cst, align 4 @@ -1611,16 +1611,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") monotonic @@ -1776,16 +1776,16 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; 
GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire @@ -1941,16 +1941,16 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") release @@ -2106,16 +2106,16 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel @@ -2271,16 +2271,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst @@ -2464,18 +2464,18 @@ define 
amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire @@ -2660,18 +2660,18 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: 
global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel @@ -2856,18 +2856,18 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw 
volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst @@ -3088,20 +3088,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3322,20 +3322,20 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 
v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3556,20 +3556,20 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3790,20 +3790,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4024,20 +4024,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4258,20 +4258,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4492,20 +4492,20 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4726,20 +4726,20 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; 
GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4960,20 +4960,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5194,20 +5194,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 
@@ -5428,20 +5428,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5662,20 +5662,20 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5896,20 +5896,20 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], 
s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6130,20 +6130,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 
v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6364,20 +6364,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6628,22 +6628,22 @@ define amdgpu_kernel void 
@global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6896,22 +6896,22 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
global_wavefront_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7164,22 +7164,22 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7432,22 +7432,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7700,22 +7700,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: 
global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7968,22 +7968,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: 
s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8236,22 +8236,22 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8504,22 +8504,22 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8772,22 +8772,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, 
v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9040,22 +9040,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9308,22 +9308,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9576,22 +9576,22 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 
v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9844,22 +9844,22 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10112,22 +10112,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10380,22 +10380,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11336,16 +11336,16 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") unordered, align 4 @@ -11503,16 +11503,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") monotonic, align 4 @@ -11670,16 +11670,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
global_wavefront_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") release, align 4 @@ -11837,16 +11837,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -12002,16 +12002,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -12167,16 +12167,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12332,16 +12332,16 @@ define 
amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") release @@ -12497,16 +12497,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -12662,16 +12662,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -12855,18 +12855,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; 
GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -13051,18 +13051,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN 
+; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -13247,18 +13247,18 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -13479,20 +13479,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13713,20 +13713,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, 
v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13947,20 +13947,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14181,20 +14181,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; 
GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14415,20 +14415,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14649,20 +14649,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
global_wavefront_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14883,20 +14883,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15117,20 +15117,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 
v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15351,20 +15351,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15585,20 +15585,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15819,20 +15819,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] 
offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16053,20 +16053,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16287,20 +16287,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
global_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16521,20 +16521,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16755,20 +16755,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17019,22 +17019,22 @@ define amdgpu_kernel void 
@global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17287,22 +17287,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17555,22 +17555,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17823,22 +17823,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18091,22 +18091,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18359,22 +18359,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; 
GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18627,22 +18627,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18895,22 +18895,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19163,22 +19163,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19431,22 +19431,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19699,22 +19699,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, 
s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19967,22 +19967,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20235,22 +20235,22 @@ define 
amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20503,22 +20503,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20771,22 +20771,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index bbbf8cf7f5cb1..885edec03c2b6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -799,18 +799,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry 
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") seq_cst, align 4 @@ -969,16 +969,16 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, 
s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") unordered, align 4 @@ -1136,16 +1136,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") monotonic, align 4 @@ -1321,18 +1321,18 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_release_store: +; GFX1250: ; %bb.0: ; 
%entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4 @@ -1508,18 +1508,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") seq_cst, align 4 @@ -1675,16 +1675,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
global_workgroup_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") monotonic @@ -1850,17 +1850,17 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire @@ -2034,18 +2034,18 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") release @@ -2229,19 +2229,19 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel @@ -2425,19 +2425,19 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 
v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2626,18 +2626,18 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire @@ -2847,20 +2847,20 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel @@ -3070,20 +3070,20 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; 
GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -3304,20 +3304,20 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3548,21 +3548,21 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3801,22 +3801,22 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; 
GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4065,23 +4065,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4330,23 +4330,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4577,21 +4577,21 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4822,21 +4822,21 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; 
%entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5085,23 +5085,23 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5350,23 +5350,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5615,23 +5615,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5880,23 +5880,23 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 
@@ -6145,23 +6145,23 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6410,23 +6410,23 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6675,23 +6675,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6940,23 +6940,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7207,22 +7207,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7480,22 +7480,22 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; 
GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7766,24 +7766,24 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: 
s_endpgm +; GFX1250-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8061,24 +8061,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8356,24 +8356,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8633,22 +8633,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8906,22 +8906,22 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 
v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9199,24 +9199,24 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9494,24 +9494,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9789,24 +9789,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10084,24 +10084,24 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10377,24 +10377,24 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10672,24 +10672,24 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 
0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10967,24 +10967,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: 
global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11262,24 +11262,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, 
v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -12070,18 +12070,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -12240,16 +12240,16 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") unordered, align 4 @@ -12407,16 +12407,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") monotonic, align 4 @@ -12584,18 +12584,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") release, align 4 @@ -12763,18 +12763,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; 
GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -12930,16 +12930,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -13105,17 +13105,17 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 
0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -13281,18 +13281,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: 
s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") release @@ -13468,19 +13468,19 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -13656,19 +13656,19 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -13857,18 +13857,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -14070,20 +14070,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -14285,20 +14285,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: 
global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -14519,20 +14519,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, 
s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14763,21 +14763,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15008,22 +15008,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15264,23 +15264,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15521,23 +15521,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15768,21 +15768,21 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16013,21 +16013,21 @@ define amdgpu_kernel void 
@global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16268,23 +16268,23 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16525,23 +16525,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16782,23 +16782,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17039,23 +17039,23 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 
0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17296,23 +17296,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; 
GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17553,23 +17553,23 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17810,23 +17810,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18067,23 +18067,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18334,22 +18334,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18607,22 +18607,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, 
ptr addrspace(1) %out, i32 4 @@ -18885,24 +18885,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) 
%out, i32 4 @@ -19172,24 +19172,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ 
-19459,24 +19459,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19736,22 +19736,22 
@@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20009,22 +20009,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; 
GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20294,24 +20294,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20581,24 +20581,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20868,24 +20868,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21155,24 +21155,24 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 
0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21440,24 +21440,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21727,24 +21727,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22014,24 +22014,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22301,24 +22301,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 7428ddc780675..986b48b60a443 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -756,19 +756,19 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: 
s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") seq_cst, align 4 @@ -1229,17 +1229,17 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") release, align 4 @@ -1399,17 +1399,17 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 4 @@ -1886,17 +1886,17 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_atomicrmw: +; GFX1250: ; 
%bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") release @@ -2072,18 +2072,18 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel @@ -2259,18 +2259,18 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 
s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst @@ -2678,20 +2678,20 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 
+; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel @@ -2899,20 +2899,20 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst @@ -3474,19 +3474,19 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
local_agent_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3689,20 +3689,20 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_monotonic_cmpxchg: +; GFX1250: 
; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3905,20 +3905,20 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr 
addrspace(3) %out, i32 4 @@ -4513,20 +4513,20 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4729,20 +4729,20 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; 
GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4945,20 +4945,20 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5161,20 +5161,20 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5377,20 +5377,20 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 
0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5593,20 +5593,20 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, 
s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5809,20 +5809,20 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6025,20 +6025,20 @@ define 
amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6724,22 +6724,22 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6975,22 +6975,22 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: 
s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7226,22 +7226,22 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 
0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7939,22 +7939,22 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8190,22 +8190,22 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
local_agent_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8441,22 +8441,22 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8692,22 +8692,22 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: 
s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8943,22 +8943,22 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9194,22 +9194,22 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9445,22 +9445,22 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: 
s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9696,22 +9696,22 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index d57736ba0230c..81bbe0a78203e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -756,19 +756,19 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 
v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in seq_cst, align 4 @@ -1229,17 +1229,17 @@ define amdgpu_kernel void @local_system_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out release, align 4 @@ -1399,17 +1399,17 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: 
s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out seq_cst, align 4 @@ -1886,17 +1886,17 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr 
addrspace(3) %out, i32 %in release @@ -2072,18 +2072,18 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel @@ -2259,18 +2259,18 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; 
GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst @@ -2678,20 +2678,20 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel @@ -2899,20 +2899,20 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: 
ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst @@ -3474,19 +3474,19 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 
offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3689,20 +3689,20 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr 
addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3905,20 +3905,20 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4513,20 +4513,20 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4729,20 +4729,20 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4945,20 +4945,20 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5161,20 +5161,20 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
local_system_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5377,20 +5377,20 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: 
s_endpgm +; GFX1250-LABEL: local_system_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5593,20 +5593,20 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr 
addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5809,20 +5809,20 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6025,20 +6025,20 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6724,22 +6724,22 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; 
GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6975,22 +6975,22 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7226,22 +7226,22 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7939,22 +7939,22 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
local_system_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8190,22 +8190,22 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8441,22 +8441,22 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 
s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8692,22 +8692,22 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8943,22 +8943,22 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9194,22 +9194,22 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; 
GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9445,22 +9445,22 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9696,22 +9696,22 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: 
+; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index d8ba02adf4b35..980141a87ecf3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -883,17 +883,17 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_volatile_workgroup_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_volatile_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: 
ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic volatile i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 7220c071bf657..6a233a2c9013b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -756,19 +756,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") seq_cst, align 4 @@ -1229,17 +1229,17 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_store: -; 
GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4 @@ -1399,17 +1399,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: 
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") seq_cst, align 4 @@ -1886,17 +1886,17 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") release @@ -2072,18 +2072,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_atomicrmw: +; 
GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel @@ -2259,18 +2259,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst @@ -2678,20 +2678,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: 
local_workgroup_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel @@ -2899,20 +2899,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; 
GFX1250-LABEL: local_workgroup_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst @@ -3474,19 +3474,19 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3689,20 +3689,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3905,20 +3905,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 
v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4513,20 +4513,20 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4729,20 +4729,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4945,20 +4945,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: 
-; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5161,20 +5161,20 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: 
local_workgroup_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5377,20 +5377,20 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 
%in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5593,20 +5593,20 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5809,20 +5809,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6025,20 +6025,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6724,22 +6724,22 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ 
-6975,22 +6975,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7226,22 +7226,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: 
s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7939,22 +7939,22 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8190,22 +8190,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8441,22 +8441,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr 
addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8692,22 +8692,22 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8943,22 +8943,22 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; 
GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9194,22 +9194,22 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9445,22 +9445,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 
0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9696,22 +9696,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; 
GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 From c6180d95965e41a440b7e6b92e0e8890000091b8 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Fri, 22 Aug 2025 10:12:03 +0200 Subject: [PATCH 4/5] Comments --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 7 ++++--- llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 0451b27bc81c5..ac6ad085f014e 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -609,7 +609,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) { // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases // the behavior is the same if assuming GFX12.0 in CU mode. - assert(ST.hasGFX1250Insts() ? ST.isCuModeEnabled() : true); + assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled()); } bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -2630,14 +2630,15 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { const bool IsRMW = (MI.mayLoad() && MI.mayStore()); bool Changed = false; - // GFX12.5 only: xcnt wait is needed before flat and global atomics stores/rmw + // GFX12.5 only: xcnt wait is needed before flat and global atomics + // stores/rmw. if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) { MachineBasicBlock &MBB = *MI.getParent(); BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0); Changed = true; } - // Remaining fixes do not apply to RMWs + // Remaining fixes do not apply to RMWs. 
if (IsRMW) return Changed; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index e170268b47c44..12a27db241c4e 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1655,7 +1655,7 @@ let OtherPredicates = [HasImageInsts] in { let SubtargetPredicate = HasWaitXcnt in { - def S_WAIT_XCNT_soft : SOPP_Pseudo<"s_soft_wait_xcnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_XCNT_soft : SOPP_Pseudo<"", (ins s16imm:$simm16), "$simm16">; } // Represents the point at which a wave must wait for all outstanding direct loads to LDS. From 9567d51d848b886011d1ef5557303988583ceb0a Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 9 Sep 2025 10:29:04 +0200 Subject: [PATCH 5/5] Rebase, handle barrier patch --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 3 +- .../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 108 +++--- llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll | 4 + .../llvm.amdgcn.cooperative.atomic-agent.ll | 12 + ...lvm.amdgcn.cooperative.atomic-workgroup.ll | 18 + .../AMDGPU/memory-legalizer-barriers.ll | 9 +- .../AMDGPU/memory-legalizer-private-agent.ll | 310 ++++++++---------- .../memory-legalizer-private-singlethread.ll | 84 +++++ .../AMDGPU/memory-legalizer-private-system.ll | 241 +++++--------- .../memory-legalizer-private-wavefront.ll | 84 +++++ .../memory-legalizer-private-workgroup.ll | 282 +++++++++++++--- 11 files changed, 731 insertions(+), 424 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index ac6ad085f014e..c20fcacb8fb26 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -2202,7 +2202,8 @@ bool SIGfx10CacheControl::insertBarrierStart( // mode. 
This is because a CU mode release fence does not emit any wait, which // is fine when only dealing with vmem, but isn't sufficient in the presence // of barriers which do not go through vmem. - if (!ST.isCuModeEnabled()) + // GFX12.5 does not require this additional wait. + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) return false; BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 62129ebe40358..1bf37d512f845 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -11859,7 +11859,9 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_4 ; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB110_5 ; GFX1250-SDAG-NEXT: s_branch .LBB110_6 ; GFX1250-SDAG-NEXT: .LBB110_3: @@ -11873,7 +11875,6 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_f64_e32 v[4:5], v[2:3], v[0:1] @@ -11884,12 +11885,12 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB110_7: ; %atomicrmw.shared ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s0, -1 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: .LBB110_8: ; %atomicrmw.end -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; @@ -11915,7 +11916,9 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: .LBB110_3: ; %Flow ; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -11927,7 +11930,6 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_f64_e32 v[4:5], v[2:3], v[0:1] @@ -11943,12 +11945,12 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB110_8: ; %atomicrmw.end -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: 
v_dual_mov_b32 v1, v3 ; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; @@ -12085,7 +12087,9 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: .LBB111_5: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB111_7 @@ -12107,6 +12111,7 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: ds_add_f64 v2, v[0:1] ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] @@ -12131,7 +12136,9 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: .LBB111_3: ; %Flow ; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12160,6 +12167,7 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: ds_add_f64 v2, v[0:1] ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB111_8: ; %atomicrmw.phi @@ -12278,20 +12286,19 @@ define double 
@flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB112_2 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB112_3 ; GFX1250-SDAG-NEXT: s_branch .LBB112_4 ; GFX1250-SDAG-NEXT: .LBB112_2: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] @@ -12299,7 +12306,7 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE ; GFX1250-SDAG-NEXT: .LBB112_4: ; %atomicrmw.end -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; @@ -12319,10 +12326,10 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN -; 
GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB112_2: ; %Flow -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 @@ -12333,7 +12340,6 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] @@ -12341,7 +12347,7 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE ; GFX1250-GISEL-NEXT: .LBB112_4: ; %atomicrmw.end -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; @@ -12438,8 +12444,9 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-SDAG-NEXT: .LBB113_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_max_num_f64 v2, v[0:1], s[0:1] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB113_2 ; GFX1250-SDAG-NEXT: .LBB113_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo @@ -12471,8 +12478,9 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v2, v[0:1], s[0:1] offset:80 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB113_2: ; %Flow ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 @@ -12579,20 +12587,19 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB114_2 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB114_3 ; GFX1250-SDAG-NEXT: s_branch .LBB114_4 ; GFX1250-SDAG-NEXT: .LBB114_2: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] @@ -12600,7 +12607,7 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE ; GFX1250-SDAG-NEXT: .LBB114_4: ; %atomicrmw.end -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; 
GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; @@ -12620,10 +12627,10 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB114_2: ; %Flow -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 @@ -12634,7 +12641,6 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] @@ -12642,7 +12648,7 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE ; GFX1250-GISEL-NEXT: .LBB114_4: ; %atomicrmw.end -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; @@ -12739,8 +12745,9 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-SDAG-NEXT: .LBB115_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1] 
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB115_2 ; GFX1250-SDAG-NEXT: .LBB115_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo @@ -12772,8 +12779,9 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1] offset:80 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB115_2: ; %Flow ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 @@ -12870,6 +12878,7 @@ define float @flat_atomic_fadd_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -12901,8 +12910,9 @@ define void @flat_atomic_fadd_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v1, v0, s[0:1] offset:40 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fadd_f32_saddr_nortn: @@ -12932,6 +12942,7 @@ define float @flat_atomic_fmax_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_max_num_f32 v0, v1, v0, 
s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -12994,8 +13005,9 @@ define void @flat_atomic_fmax_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_max_num_f32 v1, v0, s[0:1] offset:40 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmax_f32_saddr_nortn: @@ -13055,6 +13067,7 @@ define float @flat_atomic_fmin_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_min_num_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -13117,8 +13130,9 @@ define void @flat_atomic_fmin_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_min_num_f32 v1, v0, s[0:1] offset:40 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmin_f32_saddr_nortn: @@ -13178,6 +13192,7 @@ define <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> % ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_pk_add_f16 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -13209,8 +13224,9 @@ define void @flat_atomic_fadd_v2f16_saddr_nortn(ptr 
inreg %ptr, <2 x half> %data ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_pk_add_f16 v1, v0, s[0:1] offset:40 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fadd_v2f16_saddr_nortn: @@ -13251,11 +13267,13 @@ define <2 x half> @flat_atomic_fmax_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> % ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5 ; GFX1250-NEXT: v_pk_max_num_f16 v4, v0, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz .LBB124_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13331,10 +13349,11 @@ define void @flat_atomic_fmax_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data ; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_num_f16 v0, v0, v3 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13414,11 +13433,13 @@ define <2 x half> @flat_atomic_fmin_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> % ; GFX1250-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5 ; GFX1250-NEXT: v_pk_min_num_f16 v4, v0, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz .LBB126_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13494,10 +13515,11 @@ define void @flat_atomic_fmin_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data ; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_min_num_f16 v0, v0, v3 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13566,6 +13588,7 @@ define <2 x bfloat> @flat_atomic_fadd_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_pk_add_bf16 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -13589,8 +13612,9 @@ define void @flat_atomic_fadd_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %d ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; 
GFX1250-NEXT: flat_atomic_pk_add_bf16 v1, v0, s[0:1] offset:40 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_nortn: @@ -13620,11 +13644,13 @@ define <2 x bfloat> @flat_atomic_fmax_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo ; GFX1250-NEXT: v_mov_b32_e32 v5, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_num_bf16 v4, v5, v0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz .LBB130_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13676,10 +13702,11 @@ define void @flat_atomic_fmax_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %d ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_pk_max_num_bf16 v2, v3, v0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v3, v2 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13734,11 +13761,13 @@ define <2 x bfloat> @flat_atomic_fmin_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo ; GFX1250-NEXT: v_mov_b32_e32 v5, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_min_num_bf16 v4, v5, v0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v2, 
v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz .LBB132_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13790,10 +13819,11 @@ define void @flat_atomic_fmin_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %d ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_pk_min_num_bf16 v2, v3, v0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v3, v2 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll index 1e293c28ce397..ba761bedb905c 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll @@ -38,6 +38,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -79,6 +80,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -189,6 +191,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -255,6 +258,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll index e3ec4d1f0f67a..614a221d43d53 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll @@ -130,6 +130,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -144,6 +145,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -158,6 +160,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr 
noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -220,6 +223,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -234,6 +238,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -248,6 +253,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -385,6 +391,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -399,6 +406,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -413,6 +421,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -475,6 +484,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -489,6 +499,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -503,6 +514,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll index e86f0e0083805..2b04ab5ab8a00 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll @@ -124,6 +124,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -137,6 +138,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -150,6 +152,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -163,6 +166,7 @@ define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef r ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -176,6 +180,7 @@ define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr nou ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -189,6 +194,7 @@ define <4 x i32> 
@test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr nou ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -202,6 +208,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -215,6 +222,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -228,6 +236,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -358,6 +367,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -371,6 +381,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 
0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -384,6 +395,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -397,6 +409,7 @@ define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr no ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -410,6 +423,7 @@ define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -423,6 +437,7 @@ define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -436,6 +451,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], 
v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -449,6 +465,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -462,6 +479,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll index e921f581c00a7..516c3946f63dc 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll @@ -45,7 +45,6 @@ define amdgpu_kernel void @test_s_barrier() { ; ; GFX1250-LABEL: test_s_barrier: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_alu 0xffe3 ; GFX1250-NEXT: s_barrier_signal -1 ; GFX1250-NEXT: s_barrier_wait -1 ; GFX1250-NEXT: s_endpgm @@ -103,8 +102,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() { ; ; GFX1250-LABEL: test_s_barrier_workgroup_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_wait_alu 0xffe3 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_barrier_signal -1 ; GFX1250-NEXT: s_barrier_wait -1 ; GFX1250-NEXT: s_endpgm @@ -168,11 +167,9 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() { ; ; GFX1250-LABEL: test_s_barrier_agent_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: 
s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_alu 0xffe3 ; GFX1250-NEXT: s_barrier_signal -1 ; GFX1250-NEXT: s_barrier_wait -1 ; GFX1250-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll index 4ca0cc92e09be..8ac3414da7354 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll @@ -804,13 +804,9 @@ define amdgpu_kernel void @private_agent_seq_cst_load( ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -992,6 +988,7 @@ define amdgpu_kernel void @private_agent_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1170,6 +1167,7 @@ define amdgpu_kernel void @private_agent_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) 
%out) { @@ -1348,9 +1346,9 @@ define amdgpu_kernel void @private_agent_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -1530,9 +1528,9 @@ define amdgpu_kernel void @private_agent_seq_cst_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -1712,6 +1710,7 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -1890,6 +1889,7 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2070,9 +2070,9 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX1250-NEXT: ; 
kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -2252,9 +2252,9 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2436,9 +2436,9 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2674,6 +2674,7 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2910,13 +2911,11 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3152,13 +3151,11 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3430,6 +3427,7 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 
%old) { @@ -3699,6 +3697,7 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3970,9 +3969,9 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -4243,9 +4242,9 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -4518,9 +4517,9 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -4793,6 +4792,7 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -5064,6 +5064,7 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -5335,9 +5336,9 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -5610,9 +5611,9 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -5885,9 +5886,9 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6160,9 +6161,9 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6435,9 +6436,9 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6710,9 +6711,9 @@ define amdgpu_kernel void 
@private_agent_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6985,9 +6986,9 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -7260,9 +7261,9 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -7563,6 +7564,7 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -7864,6 +7866,7 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -8166,9 +8169,9 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8471,13 +8474,11 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: 
scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8779,13 +8780,11 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9087,9 +9086,8 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9391,6 +9389,7 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -9693,13 +9692,11 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: 
v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10001,13 +9998,11 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10309,13 +10304,11 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10617,13 +10610,11 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10925,9 +10916,9 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11231,13 +11222,11 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: 
global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11539,13 +11528,11 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11847,13 +11834,11 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 
off, v0, s0 scope:SCOPE_SE @@ -12658,13 +12643,9 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load( ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -12847,6 +12828,7 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13025,6 +13007,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13203,10 +13186,10 @@ define amdgpu_kernel void @private_agent_one_as_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 
%in, ptr addrspace(5) %out) { @@ -13385,10 +13368,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13567,6 +13550,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13745,6 +13729,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -13925,10 +13910,10 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr 
addrspace(5) %out, i32 %in) { @@ -14107,10 +14092,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -14291,10 +14276,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -14529,6 +14514,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -14766,13 +14752,11 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: 
s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -15009,13 +14993,11 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -15288,6 +15270,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15557,6 +15540,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 
0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -15828,10 +15812,10 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16101,10 +16085,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -16376,10 +16360,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -16651,6 +16635,7 @@ define amdgpu_kernel void 
@private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -16922,6 +16907,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -17193,10 +17179,10 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -17468,10 +17454,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: 
s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -17743,10 +17729,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -18018,10 +18004,10 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -18293,10 +18279,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -18568,10 +18554,10 @@ define amdgpu_kernel void 
@private_agent_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -18843,10 +18829,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -19118,10 +19104,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -19421,6 +19407,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -19722,6 +19709,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -20025,13 +20013,11 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -20334,13 +20320,11 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 
0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -20643,9 +20627,8 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -20948,6 +20931,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -21251,13 +21235,11 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; 
GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -21560,13 +21542,11 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -21869,13 +21849,11 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -22178,13 +22156,11 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -22487,10 +22463,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -22794,13 +22770,11 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -23103,13 +23077,11 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 
v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -23412,13 +23384,11 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll index e9ee6b4925a13..f5ba70e454823 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll @@ -984,6 +984,7 @@ define amdgpu_kernel void @private_singlethread_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1162,6 +1163,7 
@@ define amdgpu_kernel void @private_singlethread_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1340,6 +1342,7 @@ define amdgpu_kernel void @private_singlethread_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1518,6 +1521,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1696,6 +1700,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -1874,6 +1879,7 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2052,6 +2058,7 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( 
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2230,6 +2237,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2408,6 +2416,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2640,6 +2649,7 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -2875,6 +2885,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3110,6 +3121,7 @@ define 
amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3381,6 +3393,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -3650,6 +3663,7 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -3919,6 +3933,7 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4188,6 +4203,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; 
GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4457,6 +4473,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4726,6 +4743,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4995,6 +5013,7 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -5264,6 +5283,7 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -5533,6 +5553,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -5802,6 +5823,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6071,6 +6093,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6340,6 +6363,7 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6609,6 +6633,7 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6878,6 +6903,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -7147,6 +7173,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -7444,6 +7471,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -7745,6 +7773,7 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8046,6 +8075,7 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8347,6 +8377,7 @@ define amdgpu_kernel void 
@private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8648,6 +8679,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8949,6 +8981,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9250,6 +9283,7 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9551,6 +9585,7 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9852,6 +9887,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10153,6 +10189,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10454,6 +10491,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10755,6 +10793,7 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11056,6 +11095,7 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11357,6 +11397,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11658,6 +11699,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -12641,6 +12683,7 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -12819,6 +12862,7 @@ define 
amdgpu_kernel void @private_singlethread_one_as_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -12997,6 +13041,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13175,6 +13220,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13353,6 +13399,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13531,6 +13578,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13709,6 +13757,7 @@ define amdgpu_kernel void 
@private_singlethread_one_as_release_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13887,6 +13936,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -14065,6 +14115,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -14297,6 +14348,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -14532,6 +14584,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -14767,6 +14820,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -15038,6 +15092,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15307,6 +15362,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15576,6 +15632,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15845,6 +15902,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16114,6 +16172,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16383,6 +16442,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16652,6 +16712,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16921,6 +16982,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17190,6 +17252,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; 
GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17459,6 +17522,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17728,6 +17792,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17997,6 +18062,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -18266,6 +18332,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -18535,6 +18602,7 @@ define amdgpu_kernel void 
@private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -18804,6 +18872,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -19101,6 +19170,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -19402,6 +19472,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -19703,6 +19774,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20004,6 +20076,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20305,6 +20378,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20606,6 +20680,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20907,6 +20982,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21208,6 +21284,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21509,6 +21586,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21810,6 +21888,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22111,6 +22190,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22412,6 +22492,7 @@ 
define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22713,6 +22794,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -23014,6 +23096,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -23315,6 +23398,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll 
index 24ec3a34c4e6e..1e2153f76bc03 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll @@ -804,13 +804,9 @@ define amdgpu_kernel void @private_system_seq_cst_load( ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -992,6 +988,7 @@ define amdgpu_kernel void @private_system_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1170,6 +1167,7 @@ define amdgpu_kernel void @private_system_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1349,9 +1347,8 @@ define amdgpu_kernel void @private_system_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: 
flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm @@ -1532,9 +1529,8 @@ define amdgpu_kernel void @private_system_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm @@ -1714,6 +1710,7 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -1892,6 +1889,7 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -2073,9 +2071,8 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm @@ -2256,9 +2253,8 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb 
scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2441,9 +2437,8 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2679,6 +2674,7 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -2916,13 +2912,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3159,13 +3152,10 @@ define amdgpu_kernel void 
@private_system_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3437,6 +3427,7 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -3706,6 +3697,7 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -3978,9 +3970,8 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS 
; GFX1250-NEXT: s_endpgm @@ -4252,9 +4243,8 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -4528,9 +4518,8 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -4803,6 +4792,7 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -5074,6 +5064,7 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -5346,9 +5337,8 
@@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -5622,9 +5612,8 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -5898,9 +5887,8 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6174,9 +6162,8 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6477,6 +6464,7 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -6778,6 +6766,7 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -7081,13 +7070,10 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -7390,13 +7376,10 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -7698,9 +7681,8 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8002,6 +7984,7 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -8305,13 +8288,10 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8614,13 +8594,10 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8923,13 +8900,10 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9232,13 +9206,10 @@ define amdgpu_kernel void 
@private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9541,9 +9512,8 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9848,13 +9818,10 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, 
v0, s0 scope:SCOPE_SE @@ -10157,13 +10124,10 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10466,13 +10430,10 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11277,13 +11238,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load( ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: 
s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -11466,6 +11423,7 @@ define amdgpu_kernel void @private_system_one_as_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -11644,6 +11602,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -11823,10 +11782,9 @@ define amdgpu_kernel void @private_system_one_as_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -12006,10 +11964,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -12188,6 +12145,7 @@ define amdgpu_kernel void 
@private_system_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -12366,6 +12324,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -12547,10 +12506,9 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -12730,10 +12688,9 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -12915,10 +12872,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: 
global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -13153,6 +13109,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -13391,13 +13348,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -13635,13 +13589,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: 
s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -13914,6 +13865,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -14183,6 +14135,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -14455,10 +14408,9 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -14729,10 +14681,9 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -15005,10 +14956,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -15280,6 +15230,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -15551,6 +15502,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -15823,10 +15775,9 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -16099,10 +16050,9 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -16375,10 +16325,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -16651,10 +16600,9 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -16927,10 +16875,9 @@ 
define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -17203,10 +17150,9 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -17479,10 +17425,9 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -17755,10 +17700,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; 
GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -18058,6 +18002,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -18359,6 +18304,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -18663,10 +18609,9 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -18969,13 +18914,10 @@ define amdgpu_kernel void 
@private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -19279,13 +19221,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -19588,9 +19527,8 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -19893,6 +19831,7 @@ define 
amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -20197,13 +20136,10 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -20507,13 +20443,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -20817,13 +20750,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -21127,13 +21057,10 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -21437,10 +21364,9 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -21745,13 +21671,10 @@ define amdgpu_kernel void 
@private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -22055,13 +21978,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -22365,13 +22285,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; 
GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll index 8b2254412c0c8..28d9d5dacd9e3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll @@ -984,6 +984,7 @@ define amdgpu_kernel void @private_wavefront_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1162,6 +1163,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1340,6 +1342,7 @@ define amdgpu_kernel void @private_wavefront_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1518,6 +1521,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1696,6 +1700,7 @@ define 
amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -1874,6 +1879,7 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2052,6 +2058,7 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2230,6 +2237,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2408,6 +2416,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2640,6 +2649,7 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed 
$vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -2875,6 +2885,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3110,6 +3121,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3381,6 +3393,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -3650,6 +3663,7 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr 
addrspace(5) %out, i32 %in, i32 %old) { @@ -3919,6 +3933,7 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4188,6 +4203,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4457,6 +4473,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4726,6 +4743,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4995,6 +5013,7 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; 
GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -5264,6 +5283,7 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -5533,6 +5553,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -5802,6 +5823,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6071,6 +6093,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6340,6 +6363,7 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 
v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6609,6 +6633,7 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6878,6 +6903,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -7147,6 +7173,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -7444,6 +7471,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -7745,6 +7773,7 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: 
v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8046,6 +8075,7 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8347,6 +8377,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8648,6 +8679,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8949,6 +8981,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; 
GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9250,6 +9283,7 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9551,6 +9585,7 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9852,6 +9887,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10153,6 +10189,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10454,6 +10491,7 @@ define amdgpu_kernel 
void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10755,6 +10793,7 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11056,6 +11095,7 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11357,6 +11397,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11658,6 +11699,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -12641,6 +12683,7 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -12819,6 +12862,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -12997,6 +13041,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13175,6 +13220,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13353,6 +13399,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 
killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13531,6 +13578,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13709,6 +13757,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13887,6 +13936,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -14065,6 +14115,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -14297,6 +14348,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; 
GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -14532,6 +14584,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -14767,6 +14820,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -15038,6 +15092,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15307,6 +15362,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) 
%out, i32 %in, i32 %old) { @@ -15576,6 +15632,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15845,6 +15902,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16114,6 +16172,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16383,6 +16442,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16652,6 +16712,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16921,6 +16982,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17190,6 +17252,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17459,6 +17522,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17728,6 +17792,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17997,6 +18062,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: 
v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -18266,6 +18332,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -18535,6 +18602,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -18804,6 +18872,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -19101,6 +19170,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -19402,6 +19472,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; 
GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -19703,6 +19774,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20004,6 +20076,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20305,6 +20378,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20606,6 +20680,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20907,6 +20982,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21208,6 +21284,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21509,6 +21586,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21810,6 +21888,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22111,6 +22190,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22412,6 +22492,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22713,6 +22794,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -23014,6 +23096,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -23315,6 +23398,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll index 127434c365f95..01b2f6835cf7b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll @@ -803,7 +803,8 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load( ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -985,6 +986,7 @@ define amdgpu_kernel void @private_workgroup_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1163,6 +1165,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1341,7 +1344,9 @@ define amdgpu_kernel void @private_workgroup_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1520,7 +1525,9 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1699,6 +1706,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -1877,8 +1885,9 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { entry: @@ -2056,7 +2065,9 @@ define 
amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2235,9 +2246,11 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { entry: @@ -2415,9 +2428,11 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { entry: @@ -2649,6 +2664,7 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -2884,7 +2900,9 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3120,7 +3138,9 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3392,6 +3412,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -3661,8 +3682,9 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; 
GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -3931,7 +3953,9 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4201,9 +4225,11 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -4472,9 +4498,11 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -4743,8 +4771,9 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -5013,8 +5042,9 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -5283,9 +5313,11 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -5554,9 +5586,11 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 
%old) { entry: @@ -5825,9 +5859,11 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -6096,9 +6132,11 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -6367,9 +6405,11 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -6638,9 +6678,11 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -6909,9 +6951,11 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -7180,9 +7224,11 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -7479,6 +7525,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -7780,6 +7827,7 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8081,7 +8129,9 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8383,7 +8433,9 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8685,7 +8737,9 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8987,6 +9041,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9288,6 +9343,7 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9589,7 +9645,9 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9891,7 +9949,9 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10193,7 +10253,9 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10495,7 +10557,9 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10797,7 +10861,9 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] 
th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11099,7 +11165,9 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11401,7 +11469,9 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11703,7 +11773,9 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -12506,6 +12578,8 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load( ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; 
GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -12687,6 +12761,7 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -12865,6 +12940,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13043,6 +13119,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13221,6 +13300,9 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; 
GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13399,6 +13481,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13577,7 +13660,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { entry: @@ -13755,6 +13840,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13933,7 +14021,11 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { entry: @@ -14111,7 +14203,11 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { entry: @@ -14343,6 +14439,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -14578,6 +14675,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -14813,6 +14913,9 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -15084,6 +15187,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; 
GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15353,7 +15457,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -15622,6 +15728,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15891,7 +16000,11 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -16160,7 +16273,11 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -16429,7 +16546,9 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -16698,7 +16817,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -16967,7 +17088,11 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -17236,7 +17361,11 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: 
v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -17505,7 +17634,11 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -17774,7 +17907,11 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -18043,7 +18180,11 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -18312,7 +18453,11 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; 
GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -18581,7 +18726,11 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -18850,7 +18999,11 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -19147,6 +19300,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -19448,6 +19602,7 @@ define amdgpu_kernel 
void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -19749,6 +19904,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20050,6 +20208,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20351,6 +20512,9 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 
0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20652,6 +20816,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20953,6 +21118,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21254,6 +21420,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21555,6 +21724,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; 
GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21856,6 +22028,9 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22157,6 +22332,9 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22458,6 +22636,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22759,6 +22940,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; 
GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -23060,6 +23244,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -23361,6 +23548,9 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE