From fdaccc9069203d753367ad1ec50da7ac7a601a7a Mon Sep 17 00:00:00 2001 From: Austin Kerbow Date: Thu, 29 May 2025 10:28:16 -0700 Subject: [PATCH] [AMDGPU] Handle direct loads to LDS in memory model Add additional waitcnt insertion to ensure proper ordering between LDS operations and direct loads from global memory to LDS on pre-GFX10 hardware. Direct LDS loads perform both a global memory load and an LDS store, which can be reordered with respect to other LDS operations without explicit synchronization. This can cause ordering violations even within a single thread. The change conservatively inserts vmcnt(0) waits for all sync scopes when the LDS address space is involved. Future optimizations in SIInsertWaitcnts can relax this to only wait for outstanding direct LDS loads rather than all vmcnt events. This change only affects LDS address space synchronization and preserves existing cross-address space ordering behavior. --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 17 + .../memory-legalizer-atomic-fence.ll | 80 ++ .../CodeGen/AMDGPU/branch-condition-and.ll | 4 +- .../CodeGen/AMDGPU/cf-loop-on-constant.ll | 2 +- .../CodeGen/AMDGPU/indirect-addressing-si.ll | 2 + .../kernel-vgpr-spill-mubuf-with-voffset.ll | 1 + .../CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll | 3 +- .../memory-legalizer-fence-mmra-local.ll | 180 ++-- .../CodeGen/AMDGPU/memory-legalizer-fence.ll | 140 ++- .../memory-legalizer-flat-singlethread.ll | 800 ++++++++++++--- .../AMDGPU/memory-legalizer-flat-volatile.ll | 10 +- .../AMDGPU/memory-legalizer-flat-wavefront.ll | 796 ++++++++++++--- .../AMDGPU/memory-legalizer-flat-workgroup.ll | 920 ++++++++++------- .../memory-legalizer-global-singlethread.ll | 304 +++++- .../memory-legalizer-global-volatile.ll | 6 +- .../memory-legalizer-global-wavefront.ll | 152 ++- .../memory-legalizer-global-workgroup.ll | 300 +++--- .../AMDGPU/memory-legalizer-local-agent.ll | 960 ++++++++++++------ .../memory-legalizer-local-nontemporal.ll | 9 +- .../memory-legalizer-local-singlethread.ll | 640 ++++++++++++ .../AMDGPU/memory-legalizer-local-system.ll | 960 ++++++++++++------ .../AMDGPU/memory-legalizer-local-volatile.ll | 28 +- .../memory-legalizer-local-wavefront.ll | 640 ++++++++++++ .../memory-legalizer-local-workgroup.ll | 960 ++++++++++++------ .../CodeGen/AMDGPU/memory-legalizer-local.mir | 716 ++++++++----- .../AMDGPU/memory-legalizer-region.mir | 691 ++++++++----- 26 files changed, 6891 insertions(+), 2430 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 56fec409d11ae..520112367e997 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -1084,6 +1084,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, bool VMCnt = false; bool LGKMCnt = false; + bool DirectLDSWait = false; if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != SIAtomicAddrSpace::NONE) { @@ -1104,6 +1105,10 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, } if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + // Wait for direct loads to LDS from global memory to ensure that + // LDS operations cannot be reordered with respect to global memory + // operations. + DirectLDSWait = true; switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -1149,6 +1154,18 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, } } + // Conservatively wait for vmcnt(0) to ensure that LDS operations and direct + // LDS loads from global memory cannot be reordered with respect to each + // other. This waitcnt can be safely optimized to wait for a higher vmcnt + // based on the number of outstanding direct LDS loads. + if (DirectLDSWait) { + unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt( + IV, 0, getExpcntBitMask(IV), getLgkmcntBitMask(IV)); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_DIRECT_LDS_LOAD_soft)) + .addImm(WaitCntImmediate); + Changed = true; + } + if (VMCnt || LGKMCnt) { unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(IV, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index 66037615f0ba0..7f197b3580042 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -13,12 +13,14 @@ define amdgpu_kernel void @system_one_as_acquire() #0 { ; GFX6-LABEL: name: system_one_as_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_one_as_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -62,11 +64,13 @@ entry: define amdgpu_kernel void @system_one_as_release() #0 { ; GFX6-LABEL: name: system_one_as_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_one_as_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -101,12 +105,14 @@ entry: define amdgpu_kernel void @system_one_as_acq_rel() #0 { ; GFX6-LABEL: name: system_one_as_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_one_as_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -150,12 +156,14 @@ entry: define amdgpu_kernel void @system_one_as_seq_cst() #0 { ; GFX6-LABEL: name: system_one_as_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_one_as_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -199,10 +207,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_acquire() #0 { ; GFX6-LABEL: name: singlethread_one_as_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_one_as_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_one_as_acquire @@ -228,10 +238,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_release() #0 { ; GFX6-LABEL: name: singlethread_one_as_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_one_as_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_one_as_release @@ -257,10 +269,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_acq_rel() #0 { ; GFX6-LABEL: name: singlethread_one_as_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_one_as_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_one_as_acq_rel @@ -286,10 +300,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_seq_cst() #0 { ; GFX6-LABEL: name: singlethread_one_as_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_one_as_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_one_as_seq_cst @@ -315,12 +331,14 @@ entry: define amdgpu_kernel void @agent_one_as_acquire() #0 { ; GFX6-LABEL: name: agent_one_as_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_one_as_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -364,11 +382,13 @@ entry: define amdgpu_kernel void @agent_one_as_release() #0 { ; GFX6-LABEL: name: agent_one_as_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_one_as_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -403,12 +423,14 @@ entry: define amdgpu_kernel void @agent_one_as_acq_rel() #0 { ; GFX6-LABEL: name: agent_one_as_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_one_as_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -452,12 +474,14 @@ entry: define amdgpu_kernel void @agent_one_as_seq_cst() #0 { ; GFX6-LABEL: name: agent_one_as_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_one_as_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -501,10 +525,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acquire() #0 { ; GFX6-LABEL: name: workgroup_one_as_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_acquire @@ -536,10 +562,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_release() #0 { ; GFX6-LABEL: name: workgroup_one_as_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_release @@ -569,10 +597,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { ; GFX6-LABEL: name: workgroup_one_as_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel @@ -604,10 +634,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; GFX6-LABEL: name: workgroup_one_as_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst @@ -639,10 +671,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_acquire() #0 { ; GFX6-LABEL: name: wavefront_one_as_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_one_as_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_one_as_acquire @@ -668,10 +702,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_release() #0 { ; GFX6-LABEL: name: wavefront_one_as_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_one_as_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_one_as_release @@ -697,10 +733,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_acq_rel() #0 { ; GFX6-LABEL: name: wavefront_one_as_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_one_as_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_one_as_acq_rel @@ -726,10 +764,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_seq_cst() #0 { ; GFX6-LABEL: name: wavefront_one_as_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_one_as_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_one_as_seq_cst @@ -755,12 +795,14 @@ entry: define amdgpu_kernel void @system_acquire() #0 { ; GFX6-LABEL: name: system_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -804,11 +846,13 @@ entry: define amdgpu_kernel void @system_release() #0 { ; GFX6-LABEL: name: system_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -843,12 +887,14 @@ entry: define amdgpu_kernel void @system_acq_rel() #0 { ; GFX6-LABEL: name: system_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -892,12 +938,14 @@ entry: define amdgpu_kernel void @system_seq_cst() #0 { ; GFX6-LABEL: name: system_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -941,10 +989,12 @@ entry: define amdgpu_kernel void @singlethread_acquire() #0 { ; GFX6-LABEL: name: singlethread_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_acquire @@ -970,10 +1020,12 @@ entry: define amdgpu_kernel void @singlethread_release() #0 { ; GFX6-LABEL: name: singlethread_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_release @@ -999,10 +1051,12 @@ entry: define amdgpu_kernel void @singlethread_acq_rel() #0 { ; GFX6-LABEL: name: singlethread_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_acq_rel @@ -1028,10 +1082,12 @@ entry: define amdgpu_kernel void @singlethread_seq_cst() #0 { ; GFX6-LABEL: name: singlethread_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_seq_cst @@ -1057,12 +1113,14 @@ entry: define amdgpu_kernel void @agent_acquire() #0 { ; GFX6-LABEL: name: agent_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -1106,11 +1164,13 @@ entry: define amdgpu_kernel void @agent_release() #0 { ; GFX6-LABEL: name: agent_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -1145,12 +1205,14 @@ entry: define amdgpu_kernel void @agent_acq_rel() #0 { ; GFX6-LABEL: name: agent_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -1194,12 +1256,14 @@ entry: define amdgpu_kernel void @agent_seq_cst() #0 { ; GFX6-LABEL: name: agent_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -1243,11 +1307,13 @@ entry: define amdgpu_kernel void @workgroup_acquire() #0 { ; GFX6-LABEL: name: workgroup_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 127 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 127 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -1282,11 +1348,13 @@ entry: define amdgpu_kernel void @workgroup_release() #0 { ; GFX6-LABEL: name: workgroup_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 127 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 127 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -1319,11 +1387,13 @@ entry: define amdgpu_kernel void @workgroup_acq_rel() #0 { ; GFX6-LABEL: name: workgroup_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 127 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 127 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -1358,11 +1428,13 @@ entry: define amdgpu_kernel void @workgroup_seq_cst() #0 { ; GFX6-LABEL: name: workgroup_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 127 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 127 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -1397,10 +1469,12 @@ entry: define amdgpu_kernel void @wavefront_acquire() #0 { ; GFX6-LABEL: name: wavefront_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_acquire @@ -1426,10 +1500,12 @@ entry: define amdgpu_kernel void @wavefront_release() #0 { ; GFX6-LABEL: name: wavefront_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_release @@ -1455,10 +1531,12 @@ entry: define amdgpu_kernel void @wavefront_acq_rel() #0 { ; GFX6-LABEL: name: wavefront_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_acq_rel @@ -1484,10 +1562,12 @@ entry: define amdgpu_kernel void @wavefront_seq_cst() #0 { ; GFX6-LABEL: name: wavefront_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll index 2bf4a2c028fdc..9fd44da40453f 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll @@ -17,12 +17,14 @@ define amdgpu_ps void @ham(float %arg, float %arg1) #0 { ; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1 ; GCN-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb4 ; GCN-NEXT: v_mov_b32_e32 v0, 4 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v0, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; divergent unreachable -; GCN-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GCN-NEXT: .LBB0_2: ; %UnifiedReturnBlock ; GCN-NEXT: s_endpgm bb: %tmp = fcmp ogt float %arg, 0.000000e+00 diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index f78cb0daee5c9..8ee5a1ef932b9 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -404,7 +404,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: v_mov_b32_e32 v0, 0 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 ; GCN_DBG-NEXT: ds_read_u8 v0, v0 -; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) +; GCN_DBG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v0 ; GCN_DBG-NEXT: s_and_b32 s0, 1, s0 ; GCN_DBG-NEXT: s_cmp_eq_u32 s0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index b5665835eaf7a..8c2011b49ceb9 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -7807,10 +7807,12 @@ define amdgpu_kernel void @multi_same_block(i32 %arg) { ; NOOPT-NEXT: ; implicit-def: $sgpr0 ; NOOPT-NEXT: v_mov_b32_e32 v0, s0 ; NOOPT-NEXT: ds_write_b32 v0, v2 +; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: s_mov_b32 m0, -1 ; NOOPT-NEXT: ; implicit-def: $sgpr0 ; NOOPT-NEXT: v_mov_b32_e32 v0, s0 ; NOOPT-NEXT: ds_write_b32 v0, v1 +; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: s_endpgm ; ; SI-MOVREL-LABEL: multi_same_block: diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index 0681263b7428e..04e352984b948 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -71,6 +71,7 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b32 v0, v1 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: .LBB0_2: ; %end ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index 44415657b6336..c4451fd0891dc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -165,8 +165,9 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX8-NOOPT-NEXT: s_mov_b32 m0, -1 ; GFX8-NOOPT-NEXT: ds_read_b32 v0, v3 -; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NOOPT-NEXT: s_barrier +; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) ; GFX8-NOOPT-NEXT: v_add_u32_e64 v1, s[0:1], v0, v0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: s_nop 1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll index 971015b391ca8..445f597516e8c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll @@ -16,12 +16,12 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX6-LABEL: workgroup_acquire_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_acquire_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_acquire_fence: @@ -36,12 +36,12 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_acquire_fence: @@ -50,7 +50,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_acquire_fence: @@ -84,12 +84,12 @@ entry: define amdgpu_kernel void @workgroup_release_fence() { ; GFX6-LABEL: workgroup_release_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_release_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_release_fence: @@ -104,12 +104,12 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_release_fence: @@ -118,7 +118,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_release_fence: @@ -150,12 +150,12 @@ entry: define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX6-LABEL: workgroup_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_acq_rel_fence: @@ -170,12 +170,12 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence: @@ -184,7 +184,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_acq_rel_fence: @@ -216,12 +216,12 @@ entry: define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX6-LABEL: workgroup_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_seq_cst_fence: @@ -236,12 +236,12 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_seq_cst_fence: @@ -250,7 +250,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_seq_cst_fence: @@ -282,10 +282,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX6-LABEL: workgroup_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acquire_fence: @@ -298,10 +300,12 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: @@ -310,6 +314,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: @@ -339,10 +344,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX6-LABEL: workgroup_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_release_fence: @@ -355,10 +362,12 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -367,6 +376,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -396,10 +406,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX6-LABEL: workgroup_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -412,10 +424,12 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -424,6 +438,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -453,10 +468,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX6-LABEL: workgroup_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_seq_cst_fence: @@ -469,10 +486,12 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: @@ -481,6 +500,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: @@ -510,12 +530,12 @@ entry: define amdgpu_kernel void @agent_acquire_fence() { ; GFX6-LABEL: agent_acquire_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_acquire_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_acquire_fence: @@ -530,12 +550,12 @@ define amdgpu_kernel void @agent_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_acquire_fence: @@ -544,7 +564,7 @@ define amdgpu_kernel void @agent_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_acquire_fence: @@ -578,12 +598,12 @@ entry: define amdgpu_kernel void @agent_release_fence() { ; GFX6-LABEL: agent_release_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_release_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_release_fence: @@ -598,12 +618,12 @@ define amdgpu_kernel void @agent_release_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_release_fence: @@ -612,7 +632,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_release_fence: @@ -644,12 +664,12 @@ entry: define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX6-LABEL: agent_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_acq_rel_fence: @@ -664,12 +684,12 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_acq_rel_fence: @@ -678,7 +698,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_acq_rel_fence: @@ -710,12 +730,12 @@ entry: define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX6-LABEL: agent_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_seq_cst_fence: @@ -730,12 +750,12 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_seq_cst_fence: @@ -744,7 +764,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_seq_cst_fence: @@ -776,10 +796,12 @@ entry: define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX6-LABEL: agent_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_acquire_fence: @@ -792,10 +814,12 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_acquire_fence: @@ -804,6 +828,7 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_one_as_acquire_fence: @@ -833,10 +858,12 @@ entry: define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX6-LABEL: agent_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_release_fence: @@ -849,10 +876,12 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_release_fence: @@ -861,6 +890,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_one_as_release_fence: @@ -890,10 +920,12 @@ entry: define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX6-LABEL: agent_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_acq_rel_fence: @@ -906,10 +938,12 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_acq_rel_fence: @@ -918,6 +952,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_one_as_acq_rel_fence: @@ -947,10 +982,12 @@ entry: define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX6-LABEL: agent_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_seq_cst_fence: @@ -963,10 +1000,12 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_seq_cst_fence: @@ -975,6 +1014,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_one_as_seq_cst_fence: @@ -1004,12 +1044,12 @@ entry: define amdgpu_kernel void @system_acquire_fence() { ; GFX6-LABEL: system_acquire_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_acquire_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_acquire_fence: @@ -1024,12 +1064,12 @@ define amdgpu_kernel void @system_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: system_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_acquire_fence: @@ -1038,7 +1078,7 @@ define amdgpu_kernel void @system_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_acquire_fence: @@ -1072,12 +1112,12 @@ entry: define amdgpu_kernel void @system_release_fence() { ; GFX6-LABEL: system_release_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_release_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_release_fence: @@ -1092,12 +1132,12 @@ define amdgpu_kernel void @system_release_fence() { ; ; SKIP-CACHE-INV-LABEL: system_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_release_fence: @@ -1106,7 +1146,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_release_fence: @@ -1138,12 +1178,12 @@ entry: define amdgpu_kernel void @system_acq_rel_fence() { ; GFX6-LABEL: system_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_acq_rel_fence: @@ -1158,12 +1198,12 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: system_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_acq_rel_fence: @@ -1172,7 +1212,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_acq_rel_fence: @@ -1204,12 +1244,12 @@ entry: define amdgpu_kernel void @system_seq_cst_fence() { ; GFX6-LABEL: system_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_seq_cst_fence: @@ -1224,12 +1264,12 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: system_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_seq_cst_fence: @@ -1238,7 +1278,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_seq_cst_fence: @@ -1270,10 +1310,12 @@ entry: define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX6-LABEL: system_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_acquire_fence: @@ -1286,10 +1328,12 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: system_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_acquire_fence: @@ -1298,6 +1342,7 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_one_as_acquire_fence: @@ -1327,10 +1372,12 @@ entry: define amdgpu_kernel void @system_one_as_release_fence() { ; GFX6-LABEL: system_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_release_fence: @@ -1343,10 +1390,12 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; ; SKIP-CACHE-INV-LABEL: system_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_release_fence: @@ -1355,6 +1404,7 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_one_as_release_fence: @@ -1384,10 +1434,12 @@ entry: define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX6-LABEL: system_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_acq_rel_fence: @@ -1400,10 +1452,12 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: system_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_acq_rel_fence: @@ -1412,6 +1466,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_one_as_acq_rel_fence: @@ -1441,10 +1496,12 @@ entry: define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX6-LABEL: system_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_seq_cst_fence: @@ -1457,10 +1514,12 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: system_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_seq_cst_fence: @@ -1469,6 +1528,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_one_as_seq_cst_fence: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index 0e459ed0f1243..0a68ec2bfa1b9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -16,10 +16,12 @@ define amdgpu_kernel void @singlethread_acquire_fence() { ; GFX6-LABEL: singlethread_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_acquire_fence: @@ -32,10 +34,12 @@ define amdgpu_kernel void @singlethread_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_acquire_fence: @@ -44,6 +48,7 @@ define amdgpu_kernel void @singlethread_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_acquire_fence: @@ -73,10 +78,12 @@ entry: define amdgpu_kernel void @singlethread_release_fence() { ; GFX6-LABEL: singlethread_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_release_fence: @@ -89,10 +96,12 @@ define amdgpu_kernel void @singlethread_release_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_release_fence: @@ -101,6 +110,7 @@ define amdgpu_kernel void @singlethread_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_release_fence: @@ -130,10 +140,12 @@ entry: define amdgpu_kernel void @singlethread_acq_rel_fence() { ; GFX6-LABEL: singlethread_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_acq_rel_fence: @@ -146,10 +158,12 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_acq_rel_fence: @@ -158,6 +172,7 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_acq_rel_fence: @@ -187,10 +202,12 @@ entry: define amdgpu_kernel void @singlethread_seq_cst_fence() { ; GFX6-LABEL: singlethread_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_seq_cst_fence: @@ -203,10 +220,12 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_seq_cst_fence: @@ -215,6 +234,7 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_seq_cst_fence: @@ -244,10 +264,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_acquire_fence() { ; GFX6-LABEL: singlethread_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_one_as_acquire_fence: @@ -260,10 +282,12 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acquire_fence: @@ -272,6 +296,7 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_one_as_acquire_fence: @@ -301,10 +326,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_release_fence() { ; GFX6-LABEL: singlethread_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_one_as_release_fence: @@ -317,10 +344,12 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_release_fence: @@ -329,6 +358,7 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_one_as_release_fence: @@ -358,10 +388,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() { ; GFX6-LABEL: singlethread_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_one_as_acq_rel_fence: @@ -374,10 +406,12 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: @@ -386,6 +420,7 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: @@ -415,10 +450,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() { ; GFX6-LABEL: singlethread_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_one_as_seq_cst_fence: @@ -431,10 +468,12 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: @@ -443,6 +482,7 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: @@ -472,10 +512,12 @@ entry: define amdgpu_kernel void @wavefront_acquire_fence() { ; GFX6-LABEL: wavefront_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_acquire_fence: @@ -488,10 +530,12 @@ define amdgpu_kernel void @wavefront_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_acquire_fence: @@ -500,6 +544,7 @@ define amdgpu_kernel void @wavefront_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_acquire_fence: @@ -529,10 +574,12 @@ entry: define amdgpu_kernel void @wavefront_release_fence() { ; GFX6-LABEL: wavefront_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_release_fence: @@ -545,10 +592,12 @@ define amdgpu_kernel void @wavefront_release_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_release_fence: @@ -557,6 +606,7 @@ define amdgpu_kernel void @wavefront_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_release_fence: @@ -586,10 +636,12 @@ entry: define amdgpu_kernel void @wavefront_acq_rel_fence() { ; GFX6-LABEL: wavefront_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_acq_rel_fence: @@ -602,10 +654,12 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_acq_rel_fence: @@ -614,6 +668,7 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_acq_rel_fence: @@ -643,10 +698,12 @@ entry: define amdgpu_kernel void @wavefront_seq_cst_fence() { ; GFX6-LABEL: wavefront_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_seq_cst_fence: @@ -659,10 +716,12 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_seq_cst_fence: @@ -671,6 +730,7 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_seq_cst_fence: @@ -700,10 +760,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_acquire_fence() { ; GFX6-LABEL: wavefront_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_one_as_acquire_fence: @@ -716,10 +778,12 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acquire_fence: @@ -728,6 +792,7 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_one_as_acquire_fence: @@ -757,10 +822,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_release_fence() { ; GFX6-LABEL: wavefront_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_one_as_release_fence: @@ -773,10 +840,12 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_release_fence: @@ -785,6 +854,7 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_one_as_release_fence: @@ -814,10 +884,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() { ; GFX6-LABEL: wavefront_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_one_as_acq_rel_fence: @@ -830,10 +902,12 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: @@ -842,6 +916,7 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: @@ -871,10 +946,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() { ; GFX6-LABEL: wavefront_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_one_as_seq_cst_fence: @@ -887,10 +964,12 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: @@ -899,6 +978,7 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: @@ -928,12 +1008,12 @@ entry: define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX6-LABEL: workgroup_acquire_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_acquire_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_acquire_fence: @@ -950,12 +1030,12 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_acquire_fence: @@ -966,7 +1046,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_acquire_fence: @@ -1006,12 +1086,12 @@ entry: define amdgpu_kernel void @workgroup_release_fence() { ; GFX6-LABEL: workgroup_release_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_release_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_release_fence: @@ -1027,12 +1107,12 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_release_fence: @@ -1042,7 +1122,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_release_fence: @@ -1081,12 +1161,12 @@ entry: define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX6-LABEL: workgroup_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_acq_rel_fence: @@ -1103,12 +1183,12 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence: @@ -1119,7 +1199,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_acq_rel_fence: @@ -1161,12 +1241,12 @@ entry: define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX6-LABEL: workgroup_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_seq_cst_fence: @@ -1183,12 +1263,12 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_seq_cst_fence: @@ -1199,7 +1279,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_seq_cst_fence: @@ -1241,10 +1321,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX6-LABEL: workgroup_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acquire_fence: @@ -1260,10 +1342,12 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: @@ -1274,6 +1358,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: @@ -1311,10 +1396,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX6-LABEL: workgroup_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_release_fence: @@ -1329,10 +1416,12 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -1342,6 +1431,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -1378,10 +1468,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX6-LABEL: workgroup_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -1397,10 +1489,12 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -1411,6 +1505,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -1450,10 +1545,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX6-LABEL: workgroup_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_seq_cst_fence: @@ -1469,10 +1566,12 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: @@ -1483,6 +1582,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index b88a10ab24a98..7e243ad064f7c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -388,9 +388,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -438,9 +439,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -453,8 +455,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -479,8 +482,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -569,10 +573,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -619,10 +625,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -634,9 +642,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -660,9 +670,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1050,6 +1062,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1091,6 +1104,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1103,6 +1117,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1125,6 +1140,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1199,6 +1215,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1240,6 +1257,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1252,6 +1270,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1274,6 +1293,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1498,6 +1518,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1539,6 +1560,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1551,6 +1573,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1573,6 +1596,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1646,6 +1670,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1687,6 +1712,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1699,6 +1725,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1721,6 +1748,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1795,7 +1823,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1836,7 +1866,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1848,7 +1880,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1870,7 +1904,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1944,7 +1980,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -1985,7 +2023,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -1997,7 +2037,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -2019,7 +2061,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -2094,9 +2138,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2147,9 +2192,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2163,8 +2209,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2191,8 +2238,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2287,10 +2335,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2340,10 +2390,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2356,9 +2408,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2384,9 +2438,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2481,10 +2537,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2534,10 +2592,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2550,9 +2610,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2578,9 +2640,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2928,6 +2992,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3011,6 +3076,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3027,6 +3093,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3057,6 +3124,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3165,6 +3233,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3248,6 +3317,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3264,6 +3334,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3294,6 +3365,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3403,7 +3475,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3486,7 +3560,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3502,7 +3578,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3532,7 +3610,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3641,7 +3721,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3724,7 +3806,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3740,7 +3824,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3770,7 +3856,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3880,6 +3968,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -3963,6 +4052,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -3979,6 +4069,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -4009,6 +4100,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -4118,6 +4210,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4201,6 +4294,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4217,6 +4311,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4247,6 +4342,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4355,7 +4451,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4438,7 +4536,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4454,7 +4554,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4484,7 +4586,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4593,7 +4697,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4676,7 +4782,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4692,7 +4800,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4722,7 +4832,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4831,7 +4943,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -4914,7 +5028,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -4930,7 +5046,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -4960,7 +5078,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -5069,7 +5189,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5152,7 +5274,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5168,7 +5292,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5198,7 +5324,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5307,7 +5435,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5390,7 +5520,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5406,7 +5538,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5436,7 +5570,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5545,7 +5681,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5628,7 +5766,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5644,7 +5784,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5674,7 +5816,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5783,7 +5927,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5866,7 +6012,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5882,7 +6030,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5912,7 +6062,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -6021,7 +6173,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6104,7 +6258,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6120,7 +6276,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6150,7 +6308,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6544,9 +6704,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -6639,9 +6800,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6659,8 +6821,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6695,8 +6858,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6827,6 +6991,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6922,6 +7087,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -6942,6 +7108,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6978,6 +7145,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7111,10 +7279,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7206,10 +7376,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7226,9 +7398,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7262,9 +7436,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7395,10 +7571,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7490,10 +7668,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7510,9 +7690,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7546,9 +7728,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7680,9 +7864,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7775,9 +7960,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7795,8 +7981,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7831,8 +8018,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7964,9 +8152,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8059,9 +8248,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8079,8 +8269,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8115,8 +8306,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8247,10 +8439,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8342,10 +8536,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8362,9 +8558,11 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8398,9 +8596,11 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8531,10 +8731,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8626,10 +8828,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8646,9 +8850,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8682,9 +8888,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8815,10 +9023,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8910,10 +9120,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8930,9 +9142,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8966,9 +9180,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9099,10 +9315,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9194,10 +9412,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9214,9 +9434,11 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9250,9 +9472,11 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9383,10 +9607,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9478,10 +9704,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9498,9 +9726,11 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9534,9 +9764,11 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9667,10 +9899,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9762,10 +9996,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9782,9 +10018,11 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9818,9 +10056,11 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9951,10 +10191,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10046,10 +10288,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10066,9 +10310,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10102,9 +10348,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10235,10 +10483,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10330,10 +10580,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10350,9 +10602,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10386,9 +10640,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10869,9 +11125,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10919,9 +11176,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10934,8 +11192,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10960,8 +11219,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11050,10 +11310,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11100,10 +11362,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11115,9 +11379,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11141,9 +11407,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11531,6 +11799,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11572,6 +11841,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11584,6 +11854,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11606,6 +11877,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11680,6 +11952,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11721,6 +11994,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11733,6 +12007,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11755,6 +12030,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11979,6 +12255,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12020,6 +12297,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12032,6 +12310,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12054,6 +12333,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12127,6 +12407,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12168,6 +12449,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12180,6 +12462,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12202,6 +12485,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12276,7 +12560,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12317,7 +12603,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12329,7 +12617,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12351,7 +12641,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12425,7 +12717,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12466,7 +12760,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12478,7 +12774,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12500,7 +12798,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12575,9 +12875,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12628,9 +12929,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12644,8 +12946,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12672,8 +12975,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12768,10 +13072,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12821,10 +13127,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12837,9 +13145,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12865,9 +13175,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12962,10 +13274,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13015,10 +13329,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13031,9 +13347,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13059,9 +13377,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13409,6 +13729,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13492,6 +13813,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13508,6 +13830,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13538,6 +13861,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13646,6 +13970,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -13729,6 +14054,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13745,6 +14071,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13775,6 +14102,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13884,7 +14212,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13967,7 +14297,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13983,7 +14315,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -14013,7 +14347,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -14122,7 +14458,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14205,7 +14543,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14221,7 +14561,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14251,7 +14593,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14361,6 +14705,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14444,6 +14789,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14460,6 +14806,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14490,6 +14837,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14599,6 +14947,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14682,6 +15031,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14698,6 +15048,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14728,6 +15079,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14836,7 +15188,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -14919,7 +15273,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -14935,7 +15291,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -14965,7 +15323,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -15074,7 +15434,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15157,7 +15519,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15173,7 +15537,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15203,7 +15569,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15312,7 +15680,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15395,7 +15765,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15411,7 +15783,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15441,7 +15815,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15550,7 +15926,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15633,7 +16011,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15649,7 +16029,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15679,7 +16061,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15788,7 +16172,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15871,7 +16257,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15887,7 +16275,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15917,7 +16307,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -16026,7 +16418,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16109,7 +16503,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16125,7 +16521,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16155,7 +16553,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16264,7 +16664,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16347,7 +16749,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16363,7 +16767,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16393,7 +16799,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16502,7 +16910,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -16585,7 +16995,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -16601,7 +17013,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -16631,7 +17045,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -17025,9 +17441,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17120,9 +17537,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17140,8 +17558,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17176,8 +17595,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17308,6 +17728,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -17403,6 +17824,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -17423,6 +17845,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17459,6 +17882,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17592,10 +18016,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17687,10 +18113,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17707,9 +18135,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17743,9 +18173,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17876,10 +18308,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17971,10 +18405,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17991,9 +18427,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18027,9 +18465,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18161,9 +18601,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18256,9 +18697,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18276,8 +18718,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18312,8 +18755,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18445,9 +18889,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18540,9 +18985,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18560,8 +19006,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18596,8 +19043,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18728,10 +19176,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18823,10 +19273,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18843,9 +19295,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18879,9 +19333,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19012,10 +19468,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19107,10 +19565,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19127,9 +19587,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19163,9 +19625,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19296,10 +19760,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19391,10 +19857,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19411,9 +19879,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19447,9 +19917,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19580,10 +20052,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19675,10 +20149,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19695,9 +20171,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19731,9 +20209,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19864,10 +20344,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19959,10 +20441,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19979,9 +20463,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20015,9 +20501,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20148,10 +20636,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20243,10 +20733,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20263,9 +20755,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20299,9 +20793,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20432,10 +20928,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20527,10 +21025,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20547,9 +21047,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20583,9 +21085,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20716,10 +21220,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20811,10 +21317,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20831,9 +21339,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20867,9 +21377,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index a88e0e217fdb4..e9475cefffab4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -852,10 +852,9 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -905,10 +904,9 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -990,7 +988,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1035,7 +1033,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 7c637a20ab47b..d9177a0faf0e1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -388,9 +388,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -438,9 +439,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -453,8 +455,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -479,8 +482,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -569,10 +573,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -619,10 +625,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -634,9 +642,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -660,9 +670,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1050,6 +1062,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1091,6 +1104,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1103,6 +1117,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1125,6 +1140,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1199,6 +1215,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1240,6 +1257,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1252,6 +1270,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1274,6 +1293,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1498,6 +1518,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1539,6 +1560,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1551,6 +1573,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1573,6 +1596,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1646,6 +1670,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1687,6 +1712,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1699,6 +1725,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1721,6 +1748,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1795,7 +1823,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1836,7 +1866,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1848,7 +1880,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1870,7 +1904,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1944,7 +1980,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -1985,7 +2023,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -1997,7 +2037,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -2019,7 +2061,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -2094,9 +2138,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2147,9 +2192,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2163,8 +2209,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2191,8 +2238,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2287,10 +2335,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2340,10 +2390,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2356,9 +2408,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2384,9 +2438,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2481,10 +2537,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2534,10 +2592,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2550,9 +2610,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2578,9 +2640,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2928,6 +2992,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3011,6 +3076,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3027,6 +3093,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3057,6 +3124,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3165,6 +3233,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3248,6 +3317,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3264,6 +3334,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3294,6 +3365,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3403,7 +3475,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3486,7 +3560,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3502,7 +3578,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3532,7 +3610,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3641,7 +3721,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3724,7 +3806,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3740,7 +3824,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3770,7 +3856,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3880,6 +3968,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -3963,6 +4052,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -3979,6 +4069,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -4009,6 +4100,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -4118,6 +4210,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4201,6 +4294,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4217,6 +4311,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4247,6 +4342,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4355,7 +4451,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4438,7 +4536,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4454,7 +4554,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4484,7 +4586,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4593,7 +4697,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4676,7 +4782,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4692,7 +4800,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4722,7 +4832,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4831,7 +4943,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -4914,7 +5028,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -4930,7 +5046,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -4960,7 +5078,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -5069,7 +5189,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5152,7 +5274,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5168,7 +5292,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5198,7 +5324,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5307,7 +5435,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5390,7 +5520,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5406,7 +5538,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5436,7 +5570,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5545,7 +5681,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5628,7 +5766,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5644,7 +5784,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5674,7 +5816,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5783,7 +5927,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5866,7 +6012,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5882,7 +6030,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5912,7 +6062,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -6021,7 +6173,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6104,7 +6258,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6120,7 +6276,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6150,7 +6308,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6544,9 +6704,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -6639,9 +6800,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6659,8 +6821,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6695,8 +6858,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6827,6 +6991,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6922,6 +7087,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -6942,6 +7108,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6978,6 +7145,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7111,10 +7279,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7206,10 +7376,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7226,9 +7398,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7262,9 +7436,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7395,10 +7571,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7490,10 +7668,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7510,9 +7690,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7546,9 +7728,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7680,9 +7864,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7775,9 +7960,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7795,8 +7981,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7831,8 +8018,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7964,9 +8152,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8059,9 +8248,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8079,8 +8269,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8115,8 +8306,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8247,10 +8439,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8342,10 +8536,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8362,9 +8558,11 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8398,9 +8596,11 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8531,10 +8731,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8626,10 +8828,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8646,9 +8850,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8682,9 +8888,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8815,10 +9023,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8910,10 +9120,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8930,9 +9142,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8966,9 +9180,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9099,10 +9315,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9194,10 +9412,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9214,9 +9434,11 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9250,9 +9472,11 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9383,10 +9607,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9478,10 +9704,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9498,9 +9726,11 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9534,9 +9764,11 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9667,10 +9899,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9762,10 +9996,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9782,9 +10018,11 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9818,9 +10056,11 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9951,10 +10191,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10046,10 +10288,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10066,9 +10310,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10102,9 +10348,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10235,10 +10483,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10330,10 +10580,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10350,9 +10602,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10386,9 +10640,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10869,9 +11125,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10919,9 +11176,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10934,8 +11192,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10960,8 +11219,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11050,10 +11310,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11100,10 +11362,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11115,9 +11379,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11141,9 +11407,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11531,6 +11799,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11572,6 +11841,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11584,6 +11854,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11606,6 +11877,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11680,6 +11952,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11721,6 +11994,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11733,6 +12007,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11755,6 +12030,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11979,6 +12255,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12020,6 +12297,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12032,6 +12310,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12054,6 +12333,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12127,6 +12407,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12168,6 +12449,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12180,6 +12462,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12202,6 +12485,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12276,7 +12560,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12317,7 +12603,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12329,7 +12617,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12351,7 +12641,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12425,7 +12717,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12466,7 +12760,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12478,7 +12774,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12500,7 +12798,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12575,9 +12875,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12628,9 +12929,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12644,8 +12946,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12672,8 +12975,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12768,10 +13072,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12821,10 +13127,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12837,9 +13145,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12865,9 +13175,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12962,10 +13274,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13015,10 +13329,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13031,9 +13347,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13059,9 +13377,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13409,6 +13729,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13492,6 +13813,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13508,6 +13830,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13538,6 +13861,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13646,6 +13970,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -13729,6 +14054,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13745,6 +14071,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13775,6 +14102,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13884,7 +14212,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13967,7 +14297,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13983,7 +14315,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -14013,7 +14347,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -14122,7 +14458,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14205,7 +14543,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14221,7 +14561,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14251,7 +14593,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14361,6 +14705,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14444,6 +14789,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14460,6 +14806,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14490,6 +14837,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14599,6 +14947,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14682,6 +15031,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14698,6 +15048,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14728,6 +15079,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14836,7 +15188,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -14919,7 +15273,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -14935,7 +15291,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -14965,7 +15323,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -15074,7 +15434,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15157,7 +15519,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15173,7 +15537,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15203,7 +15569,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15312,7 +15680,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15395,7 +15765,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15411,7 +15783,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15441,7 +15815,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15550,7 +15926,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15633,7 +16011,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15649,7 +16029,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15679,7 +16061,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15788,7 +16172,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15871,7 +16257,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15887,7 +16275,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15917,7 +16307,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -16026,7 +16418,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16109,7 +16503,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16125,7 +16521,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16155,7 +16553,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16264,7 +16664,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16347,7 +16749,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16363,7 +16767,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16393,7 +16799,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16502,7 +16910,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -16585,7 +16995,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -16601,7 +17013,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -16631,7 +17045,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -17025,9 +17441,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17120,9 +17537,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17140,8 +17558,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17176,8 +17595,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17308,10 +17728,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17403,10 +17825,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17423,9 +17847,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17459,9 +17885,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17592,10 +18020,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17687,10 +18117,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17707,9 +18139,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17743,9 +18177,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17877,9 +18313,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17972,9 +18409,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17992,8 +18430,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18028,8 +18467,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18161,9 +18601,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18256,9 +18697,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18276,8 +18718,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18312,8 +18755,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18444,10 +18888,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18539,10 +18985,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18559,9 +19007,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18595,9 +19045,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18728,10 +19180,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18823,10 +19277,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18843,9 +19299,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18879,9 +19337,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19012,10 +19472,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19107,10 +19569,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19127,9 +19591,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19163,9 +19629,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19296,10 +19764,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19391,10 +19861,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19411,9 +19883,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19447,9 +19921,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19580,10 +20056,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19675,10 +20153,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19695,9 +20175,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19731,9 +20213,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19864,10 +20348,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19959,10 +20445,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19979,9 +20467,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20015,9 +20505,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20148,10 +20640,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20243,10 +20737,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20263,9 +20759,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20299,9 +20797,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20432,10 +20932,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20527,10 +21029,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20547,9 +21051,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20583,9 +21089,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 0fd4aa4a7a93f..d9729611a5fc1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -388,10 +388,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -441,10 +440,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -457,9 +455,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -485,9 +482,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -581,12 +577,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -638,12 +633,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -655,11 +649,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -685,11 +678,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1093,7 +1085,7 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1138,7 +1130,7 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1151,7 +1143,7 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1175,7 +1167,7 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1259,7 +1251,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1304,7 +1296,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1317,7 +1309,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1341,7 +1333,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1575,7 +1567,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw: @@ -1621,7 +1613,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: @@ -1634,7 +1626,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: @@ -1659,7 +1651,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: @@ -1742,7 +1734,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1787,7 +1779,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1800,7 +1792,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1824,7 +1816,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1908,9 +1900,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: @@ -1958,9 +1950,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: @@ -1972,9 +1964,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: @@ -1999,9 +1991,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: @@ -2093,9 +2085,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: @@ -2143,9 +2135,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: @@ -2157,9 +2149,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: @@ -2184,9 +2176,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: @@ -2279,10 +2271,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2335,10 +2326,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2352,9 +2342,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2382,9 +2371,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2484,12 +2472,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2544,12 +2531,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2562,11 +2548,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2594,11 +2579,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2709,12 +2693,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2769,12 +2752,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2787,11 +2769,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2819,11 +2800,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3187,7 +3167,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: @@ -3275,7 +3255,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: @@ -3292,7 +3272,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: @@ -3325,7 +3305,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: @@ -3443,7 +3423,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3530,7 +3510,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3547,7 +3527,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3579,7 +3559,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3698,9 +3678,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: @@ -3790,9 +3770,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: @@ -3808,9 +3788,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: @@ -3843,9 +3823,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: @@ -3972,9 +3952,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: @@ -4064,9 +4044,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: @@ -4082,9 +4062,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: @@ -4117,9 +4097,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: @@ -4247,7 +4227,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: @@ -4335,7 +4315,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: @@ -4352,7 +4332,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: @@ -4385,7 +4365,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: @@ -4504,7 +4484,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: @@ -4592,7 +4572,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: @@ -4609,7 +4589,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: @@ -4642,7 +4622,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: @@ -4760,9 +4740,9 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: @@ -4852,9 +4832,9 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: @@ -4870,9 +4850,9 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: @@ -4905,9 +4885,9 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: @@ -5034,9 +5014,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: @@ -5126,9 +5106,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: @@ -5144,9 +5124,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: @@ -5179,9 +5159,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: @@ -5308,9 +5288,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: @@ -5400,9 +5380,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: @@ -5418,9 +5398,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: @@ -5453,9 +5433,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: @@ -5582,9 +5562,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5674,9 +5654,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5692,9 +5672,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5727,9 +5707,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6141,10 +6121,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -6239,10 +6218,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6260,9 +6238,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6298,9 +6275,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6436,7 +6412,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6535,7 +6511,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -6556,7 +6532,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6594,7 +6570,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6737,12 +6713,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -6839,12 +6814,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6861,11 +6835,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6901,11 +6874,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7052,12 +7024,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7154,12 +7125,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7176,11 +7146,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7216,11 +7185,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7368,10 +7336,9 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7466,10 +7433,9 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7487,9 +7453,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7525,9 +7490,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7666,10 +7630,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7764,10 +7727,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7785,9 +7747,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7823,9 +7784,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7961,12 +7921,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8063,12 +8022,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8085,11 +8043,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8125,11 +8082,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8276,12 +8232,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8378,12 +8333,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8400,11 +8354,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8440,11 +8393,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8591,12 +8543,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8693,12 +8644,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8715,11 +8665,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8755,11 +8704,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8906,12 +8854,11 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9008,12 +8955,11 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9030,11 +8976,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9070,11 +9015,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9221,12 +9165,11 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9323,12 +9266,11 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9345,11 +9287,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9385,11 +9326,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9534,12 +9474,11 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9636,12 +9575,11 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9658,11 +9596,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9698,11 +9635,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9849,12 +9785,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9951,12 +9886,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9973,11 +9907,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10013,11 +9946,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10164,12 +10096,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10266,12 +10197,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10288,11 +10218,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10328,11 +10257,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10829,9 +10757,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10881,9 +10810,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10896,8 +10826,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10923,8 +10854,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11018,10 +10950,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11072,10 +11006,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11087,9 +11023,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11115,9 +11053,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11519,6 +11459,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11562,6 +11503,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11574,6 +11516,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11597,6 +11540,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11678,6 +11622,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11721,6 +11666,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11733,6 +11679,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11756,6 +11703,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11987,6 +11935,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: @@ -12030,6 +11979,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: @@ -12042,6 +11992,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: @@ -12066,6 +12017,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: @@ -12145,6 +12097,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12188,6 +12141,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12200,6 +12154,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12223,6 +12178,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12304,7 +12260,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12349,7 +12307,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12361,7 +12321,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12386,7 +12348,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12473,7 +12437,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12518,7 +12484,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12530,7 +12498,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12555,7 +12525,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12643,9 +12615,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12698,9 +12671,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12714,8 +12688,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12743,8 +12718,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12844,10 +12820,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12901,10 +12879,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12917,9 +12897,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12947,9 +12929,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13058,10 +13042,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13115,10 +13101,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13131,9 +13119,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13161,9 +13151,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13525,6 +13517,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13610,6 +13603,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13626,6 +13620,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13658,6 +13653,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13772,6 +13768,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -13857,6 +13854,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13873,6 +13871,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13904,6 +13903,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14020,7 +14020,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14107,7 +14109,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14123,7 +14127,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14156,7 +14162,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14278,7 +14286,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14365,7 +14375,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14381,7 +14393,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14414,7 +14428,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14537,6 +14553,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14622,6 +14639,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14638,6 +14656,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14670,6 +14689,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14785,6 +14805,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14870,6 +14891,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14886,6 +14908,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14918,6 +14941,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: @@ -15032,7 +15056,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15119,7 +15145,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15135,7 +15163,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15168,7 +15198,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15290,7 +15322,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15377,7 +15411,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15393,7 +15429,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15426,7 +15464,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15548,7 +15588,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15635,7 +15677,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15651,7 +15695,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15684,7 +15730,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15806,7 +15854,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15893,7 +15943,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15909,7 +15961,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15942,7 +15996,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -16064,7 +16120,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16151,7 +16209,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16167,7 +16227,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16200,7 +16262,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16322,7 +16386,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16409,7 +16475,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16425,7 +16493,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16458,7 +16528,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16580,7 +16652,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16667,7 +16741,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16683,7 +16759,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16716,7 +16794,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16838,7 +16918,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16925,7 +17007,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16941,7 +17025,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16974,7 +17060,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -17381,9 +17469,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17478,9 +17567,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17498,8 +17588,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17535,8 +17626,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17672,6 +17764,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -17769,6 +17862,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -17789,6 +17883,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17826,6 +17921,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17966,10 +18062,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18065,10 +18163,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18085,9 +18185,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18123,9 +18225,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18270,10 +18374,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18369,10 +18475,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18389,9 +18497,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18427,9 +18537,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18575,9 +18687,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18672,9 +18785,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18692,8 +18806,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18729,8 +18844,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18869,9 +18985,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18966,9 +19083,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18986,8 +19104,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19023,8 +19142,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19160,10 +19280,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19259,10 +19381,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19279,9 +19403,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19317,9 +19443,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19464,10 +19592,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19563,10 +19693,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19583,9 +19715,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19621,9 +19755,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19768,10 +19904,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19867,10 +20005,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19887,9 +20027,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19925,9 +20067,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20072,10 +20216,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20171,10 +20317,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20191,9 +20339,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20229,9 +20379,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20376,10 +20528,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20475,10 +20629,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20495,9 +20651,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20533,9 +20691,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20678,10 +20838,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20777,10 +20939,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20797,9 +20961,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20835,9 +21001,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20982,10 +21150,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -21081,10 +21251,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -21101,9 +21273,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -21139,9 +21313,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -21286,10 +21462,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -21385,10 +21563,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -21405,9 +21585,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -21443,9 +21625,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 8042d38716107..61ccefd35ec16 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -586,6 +586,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX6-NEXT: s_mov_b32 s5, s14 ; GFX6-NEXT: s_mov_b32 s6, s13 ; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -601,6 +602,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -650,6 +652,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -660,7 +663,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -682,7 +685,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -1077,6 +1080,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1091,6 +1095,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1129,6 +1134,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1139,6 +1145,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1159,6 +1166,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1233,6 +1241,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1247,6 +1256,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1285,6 +1295,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1295,6 +1306,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1315,6 +1327,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1696,6 +1709,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1710,6 +1724,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1747,6 +1762,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1757,6 +1773,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1777,6 +1794,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1850,6 +1868,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1864,6 +1883,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1901,6 +1921,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1911,6 +1932,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1931,6 +1953,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2004,6 +2027,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -2018,6 +2042,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2055,6 +2080,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2065,6 +2091,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2085,6 +2112,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2341,6 +2369,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2357,6 +2386,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2402,6 +2432,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2414,6 +2445,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2438,6 +2470,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -2524,6 +2557,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2540,6 +2574,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2585,6 +2620,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2597,6 +2633,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2621,6 +2658,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -3150,6 +3188,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3178,6 +3217,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3228,6 +3268,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3242,6 +3283,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3270,6 +3312,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3369,6 +3412,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3397,6 +3441,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3447,6 +3492,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3461,6 +3507,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3489,6 +3536,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3588,6 +3636,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3616,6 +3665,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3666,6 +3716,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3680,6 +3731,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3708,6 +3760,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4245,6 +4298,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4273,6 +4327,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4323,6 +4378,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4337,6 +4393,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4365,6 +4422,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4464,6 +4522,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4492,6 +4551,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4542,6 +4602,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4556,6 +4617,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4584,6 +4646,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4683,6 +4746,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4711,6 +4775,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4761,6 +4826,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4775,6 +4841,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4803,6 +4870,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4902,6 +4970,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4930,6 +4999,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4980,6 +5050,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4994,6 +5065,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5022,6 +5094,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5121,6 +5194,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5149,6 +5223,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5199,6 +5274,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5213,6 +5289,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5241,6 +5318,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5340,6 +5418,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5368,6 +5447,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5418,6 +5498,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5432,6 +5513,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5460,6 +5542,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5559,6 +5642,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5587,6 +5671,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5637,6 +5722,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5651,6 +5737,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5679,6 +5766,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5778,6 +5866,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5806,6 +5895,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5856,6 +5946,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5870,6 +5961,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5898,6 +5990,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6499,6 +6592,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6530,6 +6624,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6588,6 +6683,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -6605,6 +6701,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -6637,6 +6734,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -6750,6 +6848,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6781,6 +6880,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6839,6 +6939,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -6856,6 +6957,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -6888,6 +6990,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7001,6 +7104,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7032,6 +7136,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -7090,6 +7195,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7107,6 +7213,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7139,6 +7246,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7754,6 +7862,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7785,6 +7894,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -7843,6 +7953,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7860,6 +7971,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7892,6 +8004,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8005,6 +8118,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8036,6 +8150,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8094,6 +8209,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8111,6 +8227,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8143,6 +8260,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8256,6 +8374,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8287,6 +8406,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8345,6 +8465,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8362,6 +8483,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8394,6 +8516,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8507,6 +8630,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8538,6 +8662,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8596,6 +8721,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8613,6 +8739,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8645,6 +8772,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8758,6 +8886,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8789,6 +8918,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8847,6 +8977,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8864,6 +8995,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8896,6 +9028,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9009,6 +9142,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9040,6 +9174,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9098,6 +9233,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9115,6 +9251,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9147,6 +9284,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9260,6 +9398,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9291,6 +9430,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9349,6 +9489,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9366,6 +9507,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9398,6 +9540,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9511,6 +9654,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9542,6 +9686,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9600,6 +9745,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9617,6 +9763,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9649,6 +9796,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -10315,6 +10463,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 s5, s14 ; GFX6-NEXT: s_mov_b32 s6, s13 ; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -10330,6 +10479,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -10379,6 +10529,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -10389,7 +10540,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -10411,7 +10562,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -10806,6 +10957,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -10820,6 +10972,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10858,6 +11011,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10868,6 +11022,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10888,6 +11043,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10962,6 +11118,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -10976,6 +11133,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11014,6 +11172,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11024,6 +11183,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11044,6 +11204,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11425,6 +11586,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -11439,6 +11601,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11476,6 +11639,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11486,6 +11650,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11506,6 +11671,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11579,6 +11745,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -11593,6 +11760,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11630,6 +11798,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11640,6 +11809,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11660,6 +11830,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11733,6 +11904,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -11747,6 +11919,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11784,6 +11957,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11794,6 +11968,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11814,6 +11989,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12070,6 +12246,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -12086,6 +12263,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12131,6 +12309,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -12143,6 +12322,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -12167,6 +12347,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -12253,6 +12434,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -12269,6 +12451,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12314,6 +12497,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -12326,6 +12510,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -12350,6 +12535,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -12879,6 +13065,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -12907,6 +13094,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -12957,6 +13145,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12971,6 +13160,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12999,6 +13189,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13098,6 +13289,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -13126,6 +13318,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -13176,6 +13369,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13190,6 +13384,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13218,6 +13413,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13317,6 +13513,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -13345,6 +13542,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -13395,6 +13593,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13409,6 +13608,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13437,6 +13637,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13974,6 +14175,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -14002,6 +14204,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -14052,6 +14255,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -14066,6 +14270,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14094,6 +14299,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14193,6 +14399,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -14221,6 +14428,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -14271,6 +14479,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -14285,6 +14494,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14313,6 +14523,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14412,6 +14623,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -14440,6 +14652,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -14490,6 +14703,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -14504,6 +14718,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14532,6 +14747,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14631,6 +14847,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -14659,6 +14876,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -14709,6 +14927,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -14723,6 +14942,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14751,6 +14971,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14850,6 +15071,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -14878,6 +15100,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -14928,6 +15151,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -14942,6 +15166,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14970,6 +15195,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15069,6 +15295,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -15097,6 +15324,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -15147,6 +15375,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -15161,6 +15390,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15189,6 +15419,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15288,6 +15519,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -15316,6 +15548,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -15366,6 +15599,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -15380,6 +15614,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15408,6 +15643,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15507,6 +15743,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -15535,6 +15772,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -15585,6 +15823,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -15599,6 +15838,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15627,6 +15867,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -16228,6 +16469,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -16259,6 +16501,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -16317,6 +16560,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -16334,6 +16578,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16366,6 +16611,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16479,6 +16725,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -16510,6 +16757,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -16568,6 +16816,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -16585,6 +16834,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16617,6 +16867,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16730,6 +16981,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -16761,6 +17013,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -16819,6 +17072,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -16836,6 +17090,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16868,6 +17123,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17483,6 +17739,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -17514,6 +17771,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -17572,6 +17830,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -17589,6 +17848,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17621,6 +17881,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17734,6 +17995,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -17765,6 +18027,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -17823,6 +18086,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -17840,6 +18104,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17872,6 +18137,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17985,6 +18251,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -18016,6 +18283,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -18074,6 +18342,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -18091,6 +18360,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18123,6 +18393,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18236,6 +18507,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -18267,6 +18539,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -18325,6 +18598,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -18342,6 +18616,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18374,6 +18649,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18487,6 +18763,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -18518,6 +18795,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -18576,6 +18854,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -18593,6 +18872,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18625,6 +18905,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18738,6 +19019,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -18769,6 +19051,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -18827,6 +19110,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -18844,6 +19128,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18876,6 +19161,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18989,6 +19275,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -19020,6 +19307,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -19078,6 +19366,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -19095,6 +19384,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19127,6 +19417,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19240,6 +19531,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -19271,6 +19563,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -19329,6 +19622,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -19346,6 +19640,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19378,6 +19673,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 8a5c5dda9f79c..27c2cc6b2d63f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -861,7 +861,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX6-NEXT: s_mov_b32 s2, s6 ; GFX6-NEXT: s_mov_b32 s3, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -876,7 +876,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -918,7 +918,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index 151ba07a0b531..4d76032f0255d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -586,6 +586,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX6-NEXT: s_mov_b32 s5, s14 ; GFX6-NEXT: s_mov_b32 s6, s13 ; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -601,6 +602,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -650,6 +652,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -660,7 +663,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -682,7 +685,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -1077,6 +1080,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1091,6 +1095,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1129,6 +1134,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1139,6 +1145,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1159,6 +1166,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1233,6 +1241,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1247,6 +1256,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1285,6 +1295,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1295,6 +1306,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1315,6 +1327,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1696,6 +1709,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1710,6 +1724,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1747,6 +1762,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1757,6 +1773,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1777,6 +1794,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1850,6 +1868,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1864,6 +1883,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1901,6 +1921,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1911,6 +1932,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1931,6 +1953,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2004,6 +2027,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -2018,6 +2042,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2055,6 +2080,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2065,6 +2091,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2085,6 +2112,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2341,6 +2369,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2357,6 +2386,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2402,6 +2432,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2414,6 +2445,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2438,6 +2470,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -2524,6 +2557,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2540,6 +2574,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2585,6 +2620,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2597,6 +2633,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2621,6 +2658,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -3150,6 +3188,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3178,6 +3217,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3228,6 +3268,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3242,6 +3283,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3270,6 +3312,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3369,6 +3412,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3397,6 +3441,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3447,6 +3492,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3461,6 +3507,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3489,6 +3536,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3588,6 +3636,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3616,6 +3665,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3666,6 +3716,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3680,6 +3731,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3708,6 +3760,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4245,6 +4298,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4273,6 +4327,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4323,6 +4378,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4337,6 +4393,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4365,6 +4422,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4464,6 +4522,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4492,6 +4551,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4542,6 +4602,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4556,6 +4617,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4584,6 +4646,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4683,6 +4746,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4711,6 +4775,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4761,6 +4826,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4775,6 +4841,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4803,6 +4870,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4902,6 +4970,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4930,6 +4999,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4980,6 +5050,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4994,6 +5065,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5022,6 +5094,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5121,6 +5194,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5149,6 +5223,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5199,6 +5274,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5213,6 +5289,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5241,6 +5318,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5340,6 +5418,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5368,6 +5447,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5418,6 +5498,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5432,6 +5513,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5460,6 +5542,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5559,6 +5642,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5587,6 +5671,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5637,6 +5722,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5651,6 +5737,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5679,6 +5766,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5778,6 +5866,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5806,6 +5895,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5856,6 +5946,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5870,6 +5961,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5898,6 +5990,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6499,6 +6592,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6530,6 +6624,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6588,6 +6683,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -6605,6 +6701,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -6637,6 +6734,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -6750,6 +6848,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6781,6 +6880,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6839,6 +6939,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -6856,6 +6957,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -6888,6 +6990,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7001,6 +7104,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7032,6 +7136,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -7090,6 +7195,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7107,6 +7213,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7139,6 +7246,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7754,6 +7862,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7785,6 +7894,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -7843,6 +7953,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7860,6 +7971,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7892,6 +8004,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8005,6 +8118,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8036,6 +8150,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8094,6 +8209,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8111,6 +8227,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8143,6 +8260,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8256,6 +8374,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8287,6 +8406,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8345,6 +8465,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8362,6 +8483,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8394,6 +8516,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8507,6 +8630,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8538,6 +8662,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8596,6 +8721,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8613,6 +8739,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8645,6 +8772,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8758,6 +8886,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8789,6 +8918,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8847,6 +8977,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8864,6 +8995,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8896,6 +9028,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9009,6 +9142,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9040,6 +9174,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9098,6 +9233,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9115,6 +9251,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9147,6 +9284,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9260,6 +9398,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9291,6 +9430,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9349,6 +9489,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9366,6 +9507,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9398,6 +9540,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9511,6 +9654,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9542,6 +9686,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9600,6 +9745,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9617,6 +9763,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9649,6 +9796,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 69b0c7f93ab0e..d64474a92847b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -591,7 +591,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX6-NEXT: s_mov_b32 s5, s14 ; GFX6-NEXT: s_mov_b32 s6, s13 ; GFX6-NEXT: s_mov_b32 s7, s12 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -607,7 +607,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -659,7 +659,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -670,7 +670,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -693,7 +693,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -1099,7 +1099,7 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1114,7 +1114,7 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1156,7 +1156,7 @@ define amdgpu_kernel void @global_workgroup_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1167,7 +1167,7 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1189,7 +1189,7 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1273,7 +1273,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1288,7 +1288,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1330,7 +1330,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1341,7 +1341,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1363,7 +1363,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1764,7 +1764,7 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1779,7 +1779,7 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1820,7 +1820,7 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1831,7 +1831,7 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1853,7 +1853,7 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1936,7 +1936,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1951,7 +1951,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2005,7 +2005,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2029,7 +2029,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2118,7 +2118,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -2133,7 +2133,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2176,7 +2176,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2187,7 +2187,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2211,7 +2211,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2488,7 +2488,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2505,7 +2505,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2555,7 +2555,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2568,7 +2568,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2595,7 +2595,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -2696,7 +2696,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2713,7 +2713,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2763,7 +2763,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2776,7 +2776,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2803,7 +2803,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -3357,7 +3357,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3386,7 +3386,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3440,7 +3440,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3455,7 +3455,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3485,7 +3485,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3594,7 +3594,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3623,7 +3623,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3679,7 +3679,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3694,7 +3694,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3726,7 +3726,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3841,7 +3841,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3870,7 +3870,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3926,7 +3926,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3941,7 +3941,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3973,7 +3973,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4546,7 +4546,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4575,7 +4575,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4631,7 +4631,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4646,7 +4646,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4678,7 +4678,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4793,7 +4793,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4822,7 +4822,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4878,7 +4878,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4893,7 +4893,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4925,7 +4925,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5040,7 +5040,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5069,7 +5069,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5125,7 +5125,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5140,7 +5140,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5172,7 +5172,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5287,7 +5287,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5316,7 +5316,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5372,7 +5372,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5387,7 +5387,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5419,7 +5419,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5534,7 +5534,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5563,7 +5563,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5619,7 +5619,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5634,7 +5634,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5666,7 +5666,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5781,7 +5781,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5810,7 +5810,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5866,7 +5866,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5881,7 +5881,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5913,7 +5913,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6028,7 +6028,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6057,7 +6057,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -6113,7 +6113,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6128,7 +6128,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6160,7 +6160,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6275,7 +6275,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6304,7 +6304,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -6360,7 +6360,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6375,7 +6375,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6407,7 +6407,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7029,7 +7029,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7061,7 +7061,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -7123,7 +7123,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7141,7 +7141,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7175,7 +7175,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7298,7 +7298,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7330,7 +7330,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -7393,7 +7393,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7411,7 +7411,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7446,7 +7446,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7574,7 +7574,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7606,7 +7606,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -7669,7 +7669,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7687,7 +7687,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7722,7 +7722,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8364,7 +8364,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8396,7 +8396,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8459,7 +8459,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8477,7 +8477,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8512,7 +8512,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8640,7 +8640,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8672,7 +8672,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8735,7 +8735,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8753,7 +8753,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8788,7 +8788,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8916,7 +8916,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8948,7 +8948,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9011,7 +9011,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9029,7 +9029,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9064,7 +9064,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9192,7 +9192,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9224,7 +9224,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9287,7 +9287,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9305,7 +9305,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9340,7 +9340,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9468,7 +9468,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9500,7 +9500,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9563,7 +9563,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9581,7 +9581,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9616,7 +9616,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9742,7 +9742,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9774,7 +9774,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9837,7 +9837,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9855,7 +9855,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9890,7 +9890,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -10018,7 +10018,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10050,7 +10050,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -10113,7 +10113,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -10131,7 +10131,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -10166,7 +10166,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -10294,7 +10294,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10326,7 +10326,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -10389,7 +10389,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -10407,7 +10407,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -10442,7 +10442,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 0467c5047a0be..24a859869bc08 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -366,7 +366,7 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -380,7 +380,7 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -419,7 +419,7 @@ define amdgpu_kernel void @local_agent_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -432,7 +432,7 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -457,7 +457,7 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -541,9 +541,9 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -556,9 +556,9 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -599,9 +599,9 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -613,9 +613,9 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -640,9 +640,9 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1015,7 +1015,7 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1027,7 +1027,7 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @local_agent_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1095,7 +1095,7 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1185,7 +1185,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1220,7 +1220,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1231,7 +1231,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1253,7 +1253,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acquire_atomicrmw: @@ -1484,7 +1484,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acquire_atomicrmw: @@ -1519,7 +1519,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_atomicrmw: @@ -1530,7 +1530,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_atomicrmw: @@ -1552,7 +1552,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acquire_atomicrmw: @@ -1627,7 +1627,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1639,7 +1639,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1674,7 +1674,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1707,7 +1707,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1785,9 +1785,9 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acq_rel_atomicrmw: @@ -1798,9 +1798,9 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acq_rel_atomicrmw: @@ -1837,9 +1837,9 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: @@ -1849,9 +1849,9 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: @@ -1873,9 +1873,9 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: @@ -1959,9 +1959,9 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_seq_cst_atomicrmw: @@ -1972,9 +1972,9 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_seq_cst_atomicrmw: @@ -2011,9 +2011,9 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: @@ -2023,9 +2023,9 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: @@ -2047,9 +2047,9 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: @@ -2134,7 +2134,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2149,7 +2149,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2191,7 +2191,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2205,7 +2205,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2232,7 +2232,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2321,9 +2321,9 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2337,9 +2337,9 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2383,9 +2383,9 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2398,9 +2398,9 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2427,9 +2427,9 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2527,9 +2527,9 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2543,9 +2543,9 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2589,9 +2589,9 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2604,9 +2604,9 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2633,9 +2633,9 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2903,7 +2903,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acquire_monotonic_cmpxchg: @@ -2917,7 +2917,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg: @@ -2958,7 +2958,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: @@ -2971,7 +2971,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: @@ -2997,7 +2997,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: @@ -3085,7 +3085,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3099,7 +3099,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -3140,7 +3140,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3153,7 +3153,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3179,7 +3179,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3270,9 +3270,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acq_rel_monotonic_cmpxchg: @@ -3285,9 +3285,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg: @@ -3330,9 +3330,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: @@ -3344,9 +3344,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: @@ -3372,9 +3372,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: @@ -3471,9 +3471,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_seq_cst_monotonic_cmpxchg: @@ -3486,9 +3486,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg: @@ -3531,9 +3531,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: @@ -3545,9 +3545,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: @@ -3573,9 +3573,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: @@ -3673,7 +3673,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_monotonic_acquire_cmpxchg: @@ -3687,7 +3687,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg: @@ -3728,7 +3728,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: @@ -3741,7 +3741,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: @@ -3767,7 +3767,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: @@ -3856,7 +3856,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acquire_acquire_cmpxchg: @@ -3870,7 +3870,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acquire_acquire_cmpxchg: @@ -3911,7 +3911,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: @@ -3924,7 +3924,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: @@ -3950,7 +3950,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: @@ -4038,9 +4038,9 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_release_acquire_cmpxchg: @@ -4053,9 +4053,9 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_release_acquire_cmpxchg: @@ -4098,9 +4098,9 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: @@ -4112,9 +4112,9 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: @@ -4140,9 +4140,9 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: @@ -4239,9 +4239,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acq_rel_acquire_cmpxchg: @@ -4254,9 +4254,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg: @@ -4299,9 +4299,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: @@ -4313,9 +4313,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: @@ -4341,9 +4341,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: @@ -4440,9 +4440,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_seq_cst_acquire_cmpxchg: @@ -4455,9 +4455,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg: @@ -4500,9 +4500,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: @@ -4514,9 +4514,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: @@ -4542,9 +4542,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: @@ -4641,9 +4641,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_monotonic_seq_cst_cmpxchg: @@ -4656,9 +4656,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg: @@ -4701,9 +4701,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: @@ -4715,9 +4715,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: @@ -4743,9 +4743,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: @@ -4842,9 +4842,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acquire_seq_cst_cmpxchg: @@ -4857,9 +4857,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg: @@ -4902,9 +4902,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: @@ -4916,9 +4916,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: @@ -4944,9 +4944,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: @@ -5043,9 +5043,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_release_seq_cst_cmpxchg: @@ -5058,9 +5058,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_release_seq_cst_cmpxchg: @@ -5103,9 +5103,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: @@ -5117,9 +5117,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: @@ -5145,9 +5145,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: @@ -5244,9 +5244,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: @@ -5259,9 +5259,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: @@ -5304,9 +5304,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: @@ -5318,9 +5318,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: @@ -5346,9 +5346,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: @@ -5445,9 +5445,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: @@ -5460,9 +5460,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: @@ -5505,9 +5505,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: @@ -5519,9 +5519,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: @@ -5547,9 +5547,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: @@ -5858,7 +5858,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -5875,7 +5875,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -5923,7 +5923,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -5939,7 +5939,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5970,7 +5970,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6073,7 +6073,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6091,7 +6091,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6142,7 +6142,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6159,7 +6159,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6191,7 +6191,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6302,9 +6302,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6320,9 +6320,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6372,9 +6372,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6389,9 +6389,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6422,9 +6422,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6536,9 +6536,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6554,9 +6554,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6606,9 +6606,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6623,9 +6623,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6656,9 +6656,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6771,7 +6771,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6788,7 +6788,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6836,7 +6836,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6852,7 +6852,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6883,7 +6883,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6987,7 +6987,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7004,7 +7004,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7052,7 +7052,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7068,7 +7068,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7099,7 +7099,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7202,9 +7202,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7220,9 +7220,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7272,9 +7272,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7289,9 +7289,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7322,9 +7322,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7436,9 +7436,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7454,9 +7454,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7506,9 +7506,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7523,9 +7523,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7556,9 +7556,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7670,9 +7670,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7688,9 +7688,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7740,9 +7740,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7757,9 +7757,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7790,9 +7790,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7904,9 +7904,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7922,9 +7922,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7974,9 +7974,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7991,9 +7991,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8024,9 +8024,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8138,9 +8138,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8156,9 +8156,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8208,9 +8208,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8225,9 +8225,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8258,9 +8258,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8372,9 +8372,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8390,9 +8390,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8442,9 +8442,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8459,9 +8459,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8492,9 +8492,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8606,9 +8606,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8624,9 +8624,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8676,9 +8676,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8693,9 +8693,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8726,9 +8726,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8840,9 +8840,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8858,9 +8858,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8910,9 +8910,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8927,9 +8927,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8960,9 +8960,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -9415,6 +9415,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9429,6 +9430,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9467,6 +9469,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9480,6 +9483,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9504,6 +9508,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9585,7 +9590,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9599,7 +9606,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9637,7 +9646,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9650,7 +9661,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9674,7 +9687,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10036,6 +10051,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10047,6 +10063,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10078,6 +10095,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10088,6 +10106,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10108,6 +10127,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10176,6 +10196,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10187,6 +10208,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10218,6 +10240,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10228,6 +10251,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10248,6 +10272,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10457,6 +10482,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10468,6 +10494,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10499,6 +10526,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10509,6 +10537,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10529,6 +10558,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10596,6 +10626,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10607,6 +10638,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10638,6 +10670,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10648,6 +10681,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10668,6 +10702,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10736,7 +10771,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10747,7 +10784,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10778,7 +10817,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10788,7 +10829,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10808,7 +10851,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10876,7 +10921,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10887,7 +10934,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10918,7 +10967,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10928,7 +10979,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10948,7 +11001,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -11017,6 +11072,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11032,6 +11088,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11073,6 +11130,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11087,6 +11145,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11113,6 +11172,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11199,7 +11259,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11214,7 +11276,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11255,7 +11319,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11269,7 +11335,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11295,7 +11363,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11382,7 +11452,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11397,7 +11469,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11438,7 +11512,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11452,7 +11528,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11478,7 +11556,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11735,6 +11815,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11748,6 +11829,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11785,6 +11867,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11797,6 +11880,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11821,6 +11905,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11901,6 +11986,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11914,6 +12000,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11951,6 +12038,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11963,6 +12051,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11987,6 +12076,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12068,7 +12158,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12081,7 +12173,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12118,7 +12212,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12130,7 +12226,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12154,7 +12252,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12235,7 +12335,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12248,7 +12350,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12285,7 +12389,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12297,7 +12403,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12321,7 +12429,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12403,6 +12513,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12416,6 +12527,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12453,6 +12565,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12465,6 +12578,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12489,6 +12603,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12570,6 +12685,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12583,6 +12699,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12620,6 +12737,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12632,6 +12750,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12656,6 +12775,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12736,7 +12856,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12749,7 +12871,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12786,7 +12910,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12798,7 +12924,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12822,7 +12950,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12903,7 +13033,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12916,7 +13048,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12953,7 +13087,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12965,7 +13101,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12989,7 +13127,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -13070,7 +13210,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13083,7 +13225,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13120,7 +13264,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13132,7 +13278,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13156,7 +13304,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13237,7 +13387,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13250,7 +13402,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13287,7 +13441,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13299,7 +13455,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13323,7 +13481,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13404,7 +13564,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13417,7 +13579,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13454,7 +13618,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13466,7 +13632,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13490,7 +13658,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13571,7 +13741,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13584,7 +13756,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13621,7 +13795,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13633,7 +13809,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13657,7 +13835,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13738,7 +13918,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13751,7 +13933,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13788,7 +13972,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13800,7 +13986,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13824,7 +14012,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13905,7 +14095,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13918,7 +14110,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13955,7 +14149,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13967,7 +14163,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13991,7 +14189,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -14284,6 +14484,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14301,6 +14502,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14348,6 +14550,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14364,6 +14567,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14394,6 +14598,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14494,6 +14699,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14511,6 +14717,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14558,6 +14765,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14574,6 +14782,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14604,6 +14813,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14705,7 +14915,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14722,7 +14934,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14769,7 +14983,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14785,7 +15001,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14815,7 +15033,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14916,7 +15136,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14933,7 +15155,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14980,7 +15204,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14996,7 +15222,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15026,7 +15254,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15128,6 +15358,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15145,6 +15376,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15192,6 +15424,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15208,6 +15441,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15238,6 +15472,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15339,6 +15574,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15356,6 +15592,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15403,6 +15640,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15419,6 +15657,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15449,6 +15688,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15549,7 +15789,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15566,7 +15808,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15613,7 +15857,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15629,7 +15875,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15659,7 +15907,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15760,7 +16010,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15777,7 +16029,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15824,7 +16078,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15840,7 +16096,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15870,7 +16128,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15971,7 +16231,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15988,7 +16250,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16035,7 +16299,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16051,7 +16317,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16081,7 +16349,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16182,7 +16452,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16199,7 +16471,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16246,7 +16520,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16262,7 +16538,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16292,7 +16570,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16393,7 +16673,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16410,7 +16692,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16457,7 +16741,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16473,7 +16759,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16503,7 +16791,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16604,7 +16894,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16621,7 +16913,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16668,7 +16962,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16684,7 +16980,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16714,7 +17012,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16815,7 +17115,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16832,7 +17134,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16879,7 +17183,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16895,7 +17201,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16925,7 +17233,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17026,7 +17336,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17043,7 +17355,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -17090,7 +17404,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17106,7 +17422,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17136,7 +17454,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 78209ee34cad4..48b3b26eae084 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -830,7 +830,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: ds_read_b32 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -845,6 +845,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -892,7 +893,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -904,7 +905,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -928,7 +929,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll index f84d451f8ecb0..b240f51241a04 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -366,6 +366,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -380,6 +381,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -418,6 +420,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -431,6 +434,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -455,6 +459,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -536,7 +541,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -550,7 +557,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -588,7 +597,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -601,7 +612,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -625,7 +638,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -987,6 +1002,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -998,6 +1014,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1029,6 +1046,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1039,6 +1057,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1059,6 +1078,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1127,6 +1147,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1138,6 +1159,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1169,6 +1191,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1179,6 +1202,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1199,6 +1223,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1408,6 +1433,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acquire_atomicrmw: @@ -1419,6 +1445,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acquire_atomicrmw: @@ -1450,6 +1477,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: @@ -1460,6 +1488,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: @@ -1480,6 +1509,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: @@ -1547,6 +1577,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1558,6 +1589,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1589,6 +1621,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1599,6 +1632,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1619,6 +1653,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1687,7 +1722,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1698,7 +1735,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1729,7 +1768,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1739,7 +1780,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1759,7 +1802,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1827,7 +1872,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1838,7 +1885,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1869,7 +1918,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1879,7 +1930,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1899,7 +1952,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1968,6 +2023,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1983,6 +2039,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2024,6 +2081,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2038,6 +2096,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2064,6 +2123,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2150,7 +2210,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2165,7 +2227,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2206,7 +2270,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2220,7 +2286,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2246,7 +2314,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2333,7 +2403,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2348,7 +2420,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2389,7 +2463,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2403,7 +2479,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2429,7 +2507,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2686,6 +2766,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2699,6 +2780,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2736,6 +2818,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2748,6 +2831,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2772,6 +2856,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2852,6 +2937,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2865,6 +2951,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -2902,6 +2989,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2914,6 +3002,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2938,6 +3027,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3019,7 +3109,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3032,7 +3124,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3069,7 +3163,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3081,7 +3177,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3105,7 +3203,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3186,7 +3286,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3199,7 +3301,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3236,7 +3340,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3248,7 +3354,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3272,7 +3380,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3354,6 +3464,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3367,6 +3478,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3404,6 +3516,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3416,6 +3529,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3440,6 +3554,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3521,6 +3636,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3534,6 +3650,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3571,6 +3688,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3583,6 +3701,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3607,6 +3726,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3687,7 +3807,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3700,7 +3822,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3737,7 +3861,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3749,7 +3875,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3773,7 +3901,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3854,7 +3984,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3867,7 +3999,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3904,7 +4038,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3916,7 +4052,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3940,7 +4078,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -4021,7 +4161,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4034,7 +4176,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4071,7 +4215,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4083,7 +4229,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4107,7 +4255,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4188,7 +4338,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4201,7 +4353,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4238,7 +4392,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4250,7 +4406,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4274,7 +4432,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4355,7 +4515,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4368,7 +4530,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4405,7 +4569,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4417,7 +4583,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4441,7 +4609,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4522,7 +4692,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4535,7 +4707,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4572,7 +4746,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4584,7 +4760,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4608,7 +4786,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4689,7 +4869,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4702,7 +4884,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4739,7 +4923,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4751,7 +4937,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4775,7 +4963,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4856,7 +5046,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4869,7 +5061,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4906,7 +5100,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4918,7 +5114,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4942,7 +5140,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -5235,6 +5435,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5252,6 +5453,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5299,6 +5501,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5315,6 +5518,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5345,6 +5549,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5445,6 +5650,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5462,6 +5668,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5509,6 +5716,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5525,6 +5733,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5555,6 +5764,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5656,7 +5866,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5673,7 +5885,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5720,7 +5934,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5736,7 +5952,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5766,7 +5984,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5867,7 +6087,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5884,7 +6106,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5931,7 +6155,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5947,7 +6173,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5977,7 +6205,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6079,6 +6309,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6096,6 +6327,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6143,6 +6375,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6159,6 +6392,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6189,6 +6423,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6290,6 +6525,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6307,6 +6543,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6354,6 +6591,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6370,6 +6608,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6400,6 +6639,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6500,7 +6740,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6517,7 +6759,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6564,7 +6808,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6580,7 +6826,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6610,7 +6858,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6711,7 +6961,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6728,7 +6980,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6775,7 +7029,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6791,7 +7047,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6821,7 +7079,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6922,7 +7182,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6939,7 +7201,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6986,7 +7250,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7002,7 +7268,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7032,7 +7300,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7133,7 +7403,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7150,7 +7422,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7197,7 +7471,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7213,7 +7489,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7243,7 +7521,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7344,7 +7624,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7361,7 +7643,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7408,7 +7692,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7424,7 +7710,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7454,7 +7742,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7555,7 +7845,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7572,7 +7864,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7619,7 +7913,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7635,7 +7931,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7665,7 +7963,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7766,7 +8066,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7783,7 +8085,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7830,7 +8134,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7846,7 +8152,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7876,7 +8184,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7977,7 +8287,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7994,7 +8306,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8041,7 +8355,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8057,7 +8373,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8087,7 +8405,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8529,6 +8849,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8543,6 +8864,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8581,6 +8903,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8594,6 +8917,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8618,6 +8942,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8699,7 +9024,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8713,7 +9040,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8751,7 +9080,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8764,7 +9095,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8788,7 +9121,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9150,6 +9485,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9161,6 +9497,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9192,6 +9529,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9202,6 +9540,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9222,6 +9561,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9290,6 +9630,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9301,6 +9642,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9332,6 +9674,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9342,6 +9685,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9362,6 +9706,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9571,6 +9916,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9582,6 +9928,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9613,6 +9960,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9623,6 +9971,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9643,6 +9992,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9710,6 +10060,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9721,6 +10072,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9752,6 +10104,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9762,6 +10115,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9782,6 +10136,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9850,7 +10205,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9861,7 +10218,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9892,7 +10251,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9902,7 +10263,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9922,7 +10285,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9990,7 +10355,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10001,7 +10368,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10032,7 +10401,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10042,7 +10413,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10062,7 +10435,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10131,6 +10506,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10146,6 +10522,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10187,6 +10564,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10201,6 +10579,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10227,6 +10606,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10313,7 +10693,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10328,7 +10710,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10369,7 +10753,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10383,7 +10769,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10409,7 +10797,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10496,7 +10886,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10511,7 +10903,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10552,7 +10946,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10566,7 +10962,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10592,7 +10990,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10849,6 +11249,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10862,6 +11263,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10899,6 +11301,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10911,6 +11314,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10935,6 +11339,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -11015,6 +11420,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11028,6 +11434,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11065,6 +11472,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11077,6 +11485,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11101,6 +11510,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11182,7 +11592,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11195,7 +11607,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11232,7 +11646,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11244,7 +11660,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11268,7 +11686,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11349,7 +11769,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11362,7 +11784,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11399,7 +11823,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11411,7 +11837,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11435,7 +11863,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11517,6 +11947,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11530,6 +11961,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11567,6 +11999,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11579,6 +12012,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11603,6 +12037,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11684,6 +12119,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11697,6 +12133,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11734,6 +12171,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11746,6 +12184,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11770,6 +12209,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11850,7 +12290,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11863,7 +12305,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11900,7 +12344,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11912,7 +12358,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11936,7 +12384,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -12017,7 +12467,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12030,7 +12482,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12067,7 +12521,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12079,7 +12535,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12103,7 +12561,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12184,7 +12644,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12197,7 +12659,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12234,7 +12698,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12246,7 +12712,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12270,7 +12738,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12351,7 +12821,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12364,7 +12836,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12401,7 +12875,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12413,7 +12889,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12437,7 +12915,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12518,7 +12998,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12531,7 +13013,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12568,7 +13052,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12580,7 +13066,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12604,7 +13092,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12685,7 +13175,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12698,7 +13190,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12735,7 +13229,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12747,7 +13243,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12771,7 +13269,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12852,7 +13352,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12865,7 +13367,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12902,7 +13406,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12914,7 +13420,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12938,7 +13446,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -13019,7 +13529,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13032,7 +13544,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13069,7 +13583,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13081,7 +13597,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13105,7 +13623,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13398,6 +13918,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13415,6 +13936,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13462,6 +13984,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13478,6 +14001,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13508,6 +14032,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13608,6 +14133,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -13625,6 +14151,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13672,6 +14199,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13688,6 +14216,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13718,6 +14247,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13819,7 +14349,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13836,7 +14368,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13883,7 +14417,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13899,7 +14435,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13929,7 +14467,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14030,7 +14570,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14047,7 +14589,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14094,7 +14638,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14110,7 +14656,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14140,7 +14688,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14242,6 +14792,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14259,6 +14810,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14306,6 +14858,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14322,6 +14875,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14352,6 +14906,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14453,6 +15008,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14470,6 +15026,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14517,6 +15074,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14533,6 +15091,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14563,6 +15122,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14663,7 +15223,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14680,7 +15242,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14727,7 +15291,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14743,7 +15309,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14773,7 +15341,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14874,7 +15444,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14891,7 +15463,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14938,7 +15512,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14954,7 +15530,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14984,7 +15562,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15085,7 +15665,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15102,7 +15684,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15149,7 +15733,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15165,7 +15751,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15195,7 +15783,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15296,7 +15886,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15313,7 +15905,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15360,7 +15954,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15376,7 +15972,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15406,7 +16004,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15507,7 +16107,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15524,7 +16126,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15571,7 +16175,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15587,7 +16193,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15617,7 +16225,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15718,7 +16328,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15735,7 +16347,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15782,7 +16396,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15798,7 +16414,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15828,7 +16446,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15929,7 +16549,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15946,7 +16568,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15993,7 +16617,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16009,7 +16635,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16039,7 +16667,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16140,7 +16770,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16157,7 +16789,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16204,7 +16838,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16220,7 +16856,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16250,7 +16888,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 74a297241d851..7b8493960a629 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -366,7 +366,7 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -380,7 +380,7 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -419,7 +419,7 @@ define amdgpu_kernel void @local_system_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -432,7 +432,7 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -457,7 +457,7 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -541,9 +541,9 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -556,9 +556,9 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -599,9 +599,9 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -613,9 +613,9 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -640,9 +640,9 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1015,7 +1015,7 @@ define amdgpu_kernel void @local_system_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1027,7 +1027,7 @@ define amdgpu_kernel void @local_system_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @local_system_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @local_system_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1095,7 +1095,7 @@ define amdgpu_kernel void @local_system_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1185,7 +1185,7 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1220,7 +1220,7 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1231,7 +1231,7 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1253,7 +1253,7 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acquire_atomicrmw: @@ -1484,7 +1484,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acquire_atomicrmw: @@ -1519,7 +1519,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_atomicrmw: @@ -1530,7 +1530,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_atomicrmw: @@ -1552,7 +1552,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acquire_atomicrmw: @@ -1627,7 +1627,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1639,7 +1639,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1674,7 +1674,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1707,7 +1707,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1785,9 +1785,9 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acq_rel_atomicrmw: @@ -1798,9 +1798,9 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acq_rel_atomicrmw: @@ -1837,9 +1837,9 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_atomicrmw: @@ -1849,9 +1849,9 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_atomicrmw: @@ -1873,9 +1873,9 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_atomicrmw: @@ -1959,9 +1959,9 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_seq_cst_atomicrmw: @@ -1972,9 +1972,9 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_seq_cst_atomicrmw: @@ -2011,9 +2011,9 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_atomicrmw: @@ -2023,9 +2023,9 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_atomicrmw: @@ -2047,9 +2047,9 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_atomicrmw: @@ -2134,7 +2134,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2149,7 +2149,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2191,7 +2191,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2205,7 +2205,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2232,7 +2232,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2321,9 +2321,9 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2337,9 +2337,9 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2383,9 +2383,9 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2398,9 +2398,9 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2427,9 +2427,9 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2527,9 +2527,9 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2543,9 +2543,9 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2589,9 +2589,9 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2604,9 +2604,9 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2633,9 +2633,9 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2903,7 +2903,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acquire_monotonic_cmpxchg: @@ -2917,7 +2917,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acquire_monotonic_cmpxchg: @@ -2958,7 +2958,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: @@ -2971,7 +2971,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: @@ -2997,7 +2997,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: @@ -3085,7 +3085,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3099,7 +3099,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -3140,7 +3140,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3153,7 +3153,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3179,7 +3179,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3270,9 +3270,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acq_rel_monotonic_cmpxchg: @@ -3285,9 +3285,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg: @@ -3330,9 +3330,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: @@ -3344,9 +3344,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: @@ -3372,9 +3372,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: @@ -3471,9 +3471,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_seq_cst_monotonic_cmpxchg: @@ -3486,9 +3486,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg: @@ -3531,9 +3531,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: @@ -3545,9 +3545,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: @@ -3573,9 +3573,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: @@ -3673,7 +3673,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_monotonic_acquire_cmpxchg: @@ -3687,7 +3687,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_monotonic_acquire_cmpxchg: @@ -3728,7 +3728,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: @@ -3741,7 +3741,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: @@ -3767,7 +3767,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: @@ -3856,7 +3856,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acquire_acquire_cmpxchg: @@ -3870,7 +3870,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acquire_acquire_cmpxchg: @@ -3911,7 +3911,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: @@ -3924,7 +3924,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: @@ -3950,7 +3950,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: @@ -4038,9 +4038,9 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_release_acquire_cmpxchg: @@ -4053,9 +4053,9 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_release_acquire_cmpxchg: @@ -4098,9 +4098,9 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_acquire_cmpxchg: @@ -4112,9 +4112,9 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_release_acquire_cmpxchg: @@ -4140,9 +4140,9 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_release_acquire_cmpxchg: @@ -4239,9 +4239,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acq_rel_acquire_cmpxchg: @@ -4254,9 +4254,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg: @@ -4299,9 +4299,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: @@ -4313,9 +4313,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: @@ -4341,9 +4341,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: @@ -4440,9 +4440,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_seq_cst_acquire_cmpxchg: @@ -4455,9 +4455,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg: @@ -4500,9 +4500,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: @@ -4514,9 +4514,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: @@ -4542,9 +4542,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: @@ -4641,9 +4641,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_monotonic_seq_cst_cmpxchg: @@ -4656,9 +4656,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg: @@ -4701,9 +4701,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: @@ -4715,9 +4715,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: @@ -4743,9 +4743,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: @@ -4842,9 +4842,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acquire_seq_cst_cmpxchg: @@ -4857,9 +4857,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg: @@ -4902,9 +4902,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: @@ -4916,9 +4916,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: @@ -4944,9 +4944,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: @@ -5043,9 +5043,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_release_seq_cst_cmpxchg: @@ -5058,9 +5058,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_release_seq_cst_cmpxchg: @@ -5103,9 +5103,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: @@ -5117,9 +5117,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: @@ -5145,9 +5145,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: @@ -5244,9 +5244,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acq_rel_seq_cst_cmpxchg: @@ -5259,9 +5259,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg: @@ -5304,9 +5304,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: @@ -5318,9 +5318,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: @@ -5346,9 +5346,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: @@ -5445,9 +5445,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_seq_cst_seq_cst_cmpxchg: @@ -5460,9 +5460,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg: @@ -5505,9 +5505,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: @@ -5519,9 +5519,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: @@ -5547,9 +5547,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: @@ -5858,7 +5858,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -5875,7 +5875,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -5923,7 +5923,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -5939,7 +5939,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5970,7 +5970,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6073,7 +6073,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6091,7 +6091,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6142,7 +6142,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6159,7 +6159,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6191,7 +6191,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6302,9 +6302,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6320,9 +6320,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6372,9 +6372,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6389,9 +6389,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6422,9 +6422,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6536,9 +6536,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6554,9 +6554,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6606,9 +6606,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6623,9 +6623,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6656,9 +6656,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6771,7 +6771,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6788,7 +6788,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6836,7 +6836,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6852,7 +6852,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6883,7 +6883,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6987,7 +6987,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7004,7 +7004,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7052,7 +7052,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7068,7 +7068,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7099,7 +7099,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7202,9 +7202,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7220,9 +7220,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7272,9 +7272,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7289,9 +7289,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7322,9 +7322,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7436,9 +7436,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7454,9 +7454,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7506,9 +7506,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7523,9 +7523,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7556,9 +7556,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7670,9 +7670,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7688,9 +7688,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7740,9 +7740,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7757,9 +7757,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7790,9 +7790,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7904,9 +7904,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7922,9 +7922,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7974,9 +7974,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7991,9 +7991,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8024,9 +8024,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8138,9 +8138,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8156,9 +8156,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8208,9 +8208,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8225,9 +8225,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8258,9 +8258,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8372,9 +8372,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8390,9 +8390,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8442,9 +8442,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8459,9 +8459,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8492,9 +8492,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8606,9 +8606,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8624,9 +8624,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8676,9 +8676,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8693,9 +8693,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8726,9 +8726,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8840,9 +8840,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8858,9 +8858,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8910,9 +8910,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8927,9 +8927,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8960,9 +8960,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -9415,6 +9415,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9429,6 +9430,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9467,6 +9469,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9480,6 +9483,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9504,6 +9508,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9585,7 +9590,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9599,7 +9606,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9637,7 +9646,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9650,7 +9661,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9674,7 +9687,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10036,6 +10051,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10047,6 +10063,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10078,6 +10095,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10088,6 +10106,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10108,6 +10127,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10176,6 +10196,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10187,6 +10208,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10218,6 +10240,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10228,6 +10251,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10248,6 +10272,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10457,6 +10482,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10468,6 +10494,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10499,6 +10526,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10509,6 +10537,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10529,6 +10558,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10596,6 +10626,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10607,6 +10638,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10638,6 +10670,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10648,6 +10681,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10668,6 +10702,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10736,7 +10771,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10747,7 +10784,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10778,7 +10817,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10788,7 +10829,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10808,7 +10851,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10876,7 +10921,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10887,7 +10934,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10918,7 +10967,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10928,7 +10979,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10948,7 +11001,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -11017,6 +11072,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11032,6 +11088,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11073,6 +11130,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11087,6 +11145,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11113,6 +11172,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11199,7 +11259,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11214,7 +11276,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11255,7 +11319,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11269,7 +11335,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11295,7 +11363,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11382,7 +11452,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11397,7 +11469,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11438,7 +11512,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11452,7 +11528,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11478,7 +11556,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11735,6 +11815,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11748,6 +11829,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11785,6 +11867,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11797,6 +11880,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11821,6 +11905,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11901,6 +11986,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11914,6 +12000,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11951,6 +12038,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11963,6 +12051,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11987,6 +12076,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12068,7 +12158,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12081,7 +12173,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12118,7 +12212,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12130,7 +12226,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12154,7 +12252,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12235,7 +12335,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12248,7 +12350,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12285,7 +12389,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12297,7 +12403,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12321,7 +12429,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12403,6 +12513,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12416,6 +12527,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12453,6 +12565,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12465,6 +12578,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12489,6 +12603,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12570,6 +12685,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12583,6 +12699,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12620,6 +12737,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12632,6 +12750,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12656,6 +12775,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12736,7 +12856,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12749,7 +12871,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12786,7 +12910,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12798,7 +12924,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12822,7 +12950,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12903,7 +13033,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12916,7 +13048,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12953,7 +13087,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12965,7 +13101,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12989,7 +13127,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -13070,7 +13210,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13083,7 +13225,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13120,7 +13264,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13132,7 +13278,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13156,7 +13304,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13237,7 +13387,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13250,7 +13402,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13287,7 +13441,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13299,7 +13455,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13323,7 +13481,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13404,7 +13564,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13417,7 +13579,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13454,7 +13618,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13466,7 +13632,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13490,7 +13658,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13571,7 +13741,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13584,7 +13756,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13621,7 +13795,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13633,7 +13809,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13657,7 +13835,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13738,7 +13918,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13751,7 +13933,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13788,7 +13972,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13800,7 +13986,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13824,7 +14012,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13905,7 +14095,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13918,7 +14110,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13955,7 +14149,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13967,7 +14163,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13991,7 +14189,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -14284,6 +14484,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14301,6 +14502,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14348,6 +14550,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14364,6 +14567,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14394,6 +14598,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14494,6 +14699,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14511,6 +14717,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14558,6 +14765,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14574,6 +14782,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14604,6 +14813,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14705,7 +14915,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14722,7 +14934,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14769,7 +14983,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14785,7 +15001,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14815,7 +15033,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14916,7 +15136,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14933,7 +15155,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14980,7 +15204,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14996,7 +15222,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15026,7 +15254,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15128,6 +15358,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15145,6 +15376,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15192,6 +15424,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15208,6 +15441,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15238,6 +15472,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15339,6 +15574,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15356,6 +15592,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15403,6 +15640,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15419,6 +15657,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15449,6 +15688,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15549,7 +15789,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15566,7 +15808,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15613,7 +15857,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15629,7 +15875,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15659,7 +15907,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15760,7 +16010,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15777,7 +16029,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15824,7 +16078,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15840,7 +16096,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15870,7 +16128,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15971,7 +16231,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15988,7 +16250,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16035,7 +16299,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16051,7 +16317,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16081,7 +16349,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16182,7 +16452,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16199,7 +16471,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16246,7 +16520,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16262,7 +16538,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16292,7 +16570,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16393,7 +16673,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16410,7 +16692,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16457,7 +16741,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16473,7 +16759,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16503,7 +16791,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16604,7 +16894,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16621,7 +16913,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16668,7 +16962,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16684,7 +16980,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16714,7 +17012,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16815,7 +17115,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16832,7 +17134,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16879,7 +17183,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16895,7 +17201,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16925,7 +17233,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17026,7 +17336,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17043,7 +17355,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -17090,7 +17404,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17106,7 +17422,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17136,7 +17454,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index bc2508411ed6b..590dedc85d9c4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -28,7 +28,7 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_read_b32 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -43,6 +43,7 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -90,7 +91,7 @@ define amdgpu_kernel void @local_volatile_load_0( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -169,7 +170,7 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], s4, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -186,6 +187,7 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -239,7 +241,7 @@ define amdgpu_kernel void @local_volatile_load_1( ; SKIP-CACHE-INV-NEXT: v_add_i32_e64 v0, s[4:5], s4, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -330,6 +332,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_volatile_store_0: @@ -343,6 +346,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_volatile_store_0: @@ -380,6 +384,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_volatile_store_0: @@ -461,6 +466,7 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_volatile_store_1: @@ -476,6 +482,7 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_volatile_store_1: @@ -517,6 +524,7 @@ define amdgpu_kernel void @local_volatile_store_1( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_volatile_store_1: @@ -610,7 +618,7 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s1 ; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -624,7 +632,7 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -663,7 +671,7 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -735,7 +743,7 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s1 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -747,7 +755,7 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -782,7 +790,7 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll index b24622a48a16b..148f3ed0ec152 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -366,6 +366,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -380,6 +381,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -418,6 +420,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -431,6 +434,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -455,6 +459,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -536,7 +541,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -550,7 +557,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -588,7 +597,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -601,7 +612,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -625,7 +638,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -987,6 +1002,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -998,6 +1014,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1029,6 +1046,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1039,6 +1057,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1059,6 +1078,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1127,6 +1147,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1138,6 +1159,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1169,6 +1191,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1179,6 +1202,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1199,6 +1223,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1408,6 +1433,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acquire_atomicrmw: @@ -1419,6 +1445,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acquire_atomicrmw: @@ -1450,6 +1477,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: @@ -1460,6 +1488,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: @@ -1480,6 +1509,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: @@ -1547,6 +1577,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1558,6 +1589,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1589,6 +1621,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1599,6 +1632,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1619,6 +1653,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1687,7 +1722,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1698,7 +1735,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1729,7 +1768,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1739,7 +1780,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1759,7 +1802,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1827,7 +1872,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1838,7 +1885,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1869,7 +1918,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1879,7 +1930,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1899,7 +1952,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1968,6 +2023,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1983,6 +2039,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2024,6 +2081,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2038,6 +2096,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2064,6 +2123,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2150,7 +2210,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2165,7 +2227,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2206,7 +2270,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2220,7 +2286,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2246,7 +2314,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2333,7 +2403,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2348,7 +2420,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2389,7 +2463,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2403,7 +2479,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2429,7 +2507,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2686,6 +2766,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2699,6 +2780,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2736,6 +2818,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2748,6 +2831,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2772,6 +2856,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2852,6 +2937,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2865,6 +2951,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -2902,6 +2989,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2914,6 +3002,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2938,6 +3027,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3019,7 +3109,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3032,7 +3124,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3069,7 +3163,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3081,7 +3177,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3105,7 +3203,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3186,7 +3286,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3199,7 +3301,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3236,7 +3340,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3248,7 +3354,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3272,7 +3380,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3354,6 +3464,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3367,6 +3478,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3404,6 +3516,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3416,6 +3529,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3440,6 +3554,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3521,6 +3636,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3534,6 +3650,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3571,6 +3688,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3583,6 +3701,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3607,6 +3726,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3687,7 +3807,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3700,7 +3822,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3737,7 +3861,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3749,7 +3875,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3773,7 +3901,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3854,7 +3984,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3867,7 +3999,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3904,7 +4038,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3916,7 +4052,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3940,7 +4078,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -4021,7 +4161,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4034,7 +4176,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4071,7 +4215,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4083,7 +4229,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4107,7 +4255,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4188,7 +4338,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4201,7 +4353,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4238,7 +4392,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4250,7 +4406,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4274,7 +4432,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4355,7 +4515,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4368,7 +4530,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4405,7 +4569,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4417,7 +4583,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4441,7 +4609,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4522,7 +4692,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4535,7 +4707,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4572,7 +4746,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4584,7 +4760,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4608,7 +4786,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4689,7 +4869,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4702,7 +4884,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4739,7 +4923,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4751,7 +4937,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4775,7 +4963,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4856,7 +5046,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4869,7 +5061,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4906,7 +5100,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4918,7 +5114,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4942,7 +5140,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -5235,6 +5435,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5252,6 +5453,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5299,6 +5501,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5315,6 +5518,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5345,6 +5549,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5445,6 +5650,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5462,6 +5668,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5509,6 +5716,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5525,6 +5733,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5555,6 +5764,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5656,7 +5866,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5673,7 +5885,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5720,7 +5934,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5736,7 +5952,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5766,7 +5984,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5867,7 +6087,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5884,7 +6106,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5931,7 +6155,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5947,7 +6173,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5977,7 +6205,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6079,6 +6309,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6096,6 +6327,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6143,6 +6375,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6159,6 +6392,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6189,6 +6423,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6290,6 +6525,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6307,6 +6543,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6354,6 +6591,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6370,6 +6608,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6400,6 +6639,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6500,7 +6740,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6517,7 +6759,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6564,7 +6808,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6580,7 +6826,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6610,7 +6858,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6711,7 +6961,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6728,7 +6980,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6775,7 +7029,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6791,7 +7047,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6821,7 +7079,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6922,7 +7182,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6939,7 +7201,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6986,7 +7250,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7002,7 +7268,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7032,7 +7300,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7133,7 +7403,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7150,7 +7422,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7197,7 +7471,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7213,7 +7489,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7243,7 +7521,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7344,7 +7624,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7361,7 +7643,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7408,7 +7692,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7424,7 +7710,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7454,7 +7742,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7555,7 +7845,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7572,7 +7864,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7619,7 +7913,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7635,7 +7931,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7665,7 +7963,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7766,7 +8066,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7783,7 +8085,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7830,7 +8134,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7846,7 +8152,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7876,7 +8184,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7977,7 +8287,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7994,7 +8306,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8041,7 +8355,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8057,7 +8373,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8087,7 +8405,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8529,6 +8849,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8543,6 +8864,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8581,6 +8903,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8594,6 +8917,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8618,6 +8942,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8699,7 +9024,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8713,7 +9040,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8751,7 +9080,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8764,7 +9095,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8788,7 +9121,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9150,6 +9485,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9161,6 +9497,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9192,6 +9529,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9202,6 +9540,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9222,6 +9561,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9290,6 +9630,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9301,6 +9642,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9332,6 +9674,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9342,6 +9685,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9362,6 +9706,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9571,6 +9916,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9582,6 +9928,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9613,6 +9960,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9623,6 +9971,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9643,6 +9992,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9710,6 +10060,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9721,6 +10072,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9752,6 +10104,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9762,6 +10115,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9782,6 +10136,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9850,7 +10205,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9861,7 +10218,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9892,7 +10251,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9902,7 +10263,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9922,7 +10285,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9990,7 +10355,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10001,7 +10368,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10032,7 +10401,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10042,7 +10413,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10062,7 +10435,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10131,6 +10506,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10146,6 +10522,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10187,6 +10564,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10201,6 +10579,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10227,6 +10606,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10313,7 +10693,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10328,7 +10710,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10369,7 +10753,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10383,7 +10769,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10409,7 +10797,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10496,7 +10886,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10511,7 +10903,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10552,7 +10946,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10566,7 +10962,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10592,7 +10990,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10849,6 +11249,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10862,6 +11263,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10899,6 +11301,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10911,6 +11314,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10935,6 +11339,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -11015,6 +11420,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11028,6 +11434,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11065,6 +11472,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11077,6 +11485,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11101,6 +11510,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11182,7 +11592,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11195,7 +11607,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11232,7 +11646,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11244,7 +11660,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11268,7 +11686,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11349,7 +11769,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11362,7 +11784,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11399,7 +11823,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11411,7 +11837,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11435,7 +11863,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11517,6 +11947,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11530,6 +11961,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11567,6 +11999,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11579,6 +12012,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11603,6 +12037,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11684,6 +12119,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11697,6 +12133,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11734,6 +12171,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11746,6 +12184,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11770,6 +12209,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11850,7 +12290,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11863,7 +12305,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11900,7 +12344,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11912,7 +12358,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11936,7 +12384,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -12017,7 +12467,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12030,7 +12482,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12067,7 +12521,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12079,7 +12535,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12103,7 +12561,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12184,7 +12644,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12197,7 +12659,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12234,7 +12698,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12246,7 +12712,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12270,7 +12738,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12351,7 +12821,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12364,7 +12836,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12401,7 +12875,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12413,7 +12889,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12437,7 +12915,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12518,7 +12998,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12531,7 +13013,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12568,7 +13052,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12580,7 +13066,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12604,7 +13092,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12685,7 +13175,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12698,7 +13190,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12735,7 +13229,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12747,7 +13243,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12771,7 +13269,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12852,7 +13352,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12865,7 +13367,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12902,7 +13406,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12914,7 +13420,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12938,7 +13446,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -13019,7 +13529,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13032,7 +13544,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13069,7 +13583,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13081,7 +13597,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13105,7 +13623,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13398,6 +13918,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13415,6 +13936,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13462,6 +13984,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13478,6 +14001,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13508,6 +14032,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13608,6 +14133,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -13625,6 +14151,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13672,6 +14199,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13688,6 +14216,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13718,6 +14247,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13819,7 +14349,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13836,7 +14368,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13883,7 +14417,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13899,7 +14435,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13929,7 +14467,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14030,7 +14570,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14047,7 +14589,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14094,7 +14638,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14110,7 +14656,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14140,7 +14688,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14242,6 +14792,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14259,6 +14810,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14306,6 +14858,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14322,6 +14875,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14352,6 +14906,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14453,6 +15008,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14470,6 +15026,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14517,6 +15074,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14533,6 +15091,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14563,6 +15122,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14663,7 +15223,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14680,7 +15242,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14727,7 +15291,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14743,7 +15309,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14773,7 +15341,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14874,7 +15444,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14891,7 +15463,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14938,7 +15512,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14954,7 +15530,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14984,7 +15562,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15085,7 +15665,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15102,7 +15684,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15149,7 +15733,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15165,7 +15751,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15195,7 +15783,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15296,7 +15886,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15313,7 +15905,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15360,7 +15954,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15376,7 +15972,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15406,7 +16004,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15507,7 +16107,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15524,7 +16126,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15571,7 +16175,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15587,7 +16193,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15617,7 +16225,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15718,7 +16328,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15735,7 +16347,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15782,7 +16396,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15798,7 +16414,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15828,7 +16446,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15929,7 +16549,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15946,7 +16568,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15993,7 +16617,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16009,7 +16635,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16039,7 +16667,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16140,7 +16770,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16157,7 +16789,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16204,7 +16838,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16220,7 +16856,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16250,7 +16888,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 62d7f4801baf8..538995a09e1c2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -366,7 +366,7 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -380,7 +380,7 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -419,7 +419,7 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -432,7 +432,7 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -457,7 +457,7 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -541,9 +541,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -556,9 +556,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -599,9 +599,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -613,9 +613,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -640,9 +640,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1015,7 +1015,7 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1027,7 +1027,7 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @local_workgroup_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1095,7 +1095,7 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1185,7 +1185,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1220,7 +1220,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1231,7 +1231,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1253,7 +1253,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acquire_atomicrmw: @@ -1484,7 +1484,7 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acquire_atomicrmw: @@ -1519,7 +1519,7 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: @@ -1530,7 +1530,7 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: @@ -1552,7 +1552,7 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: @@ -1627,7 +1627,7 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1639,7 +1639,7 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1674,7 +1674,7 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1707,7 +1707,7 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1785,9 +1785,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acq_rel_atomicrmw: @@ -1798,9 +1798,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_atomicrmw: @@ -1837,9 +1837,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: @@ -1849,9 +1849,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: @@ -1873,9 +1873,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: @@ -1959,9 +1959,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_seq_cst_atomicrmw: @@ -1972,9 +1972,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_atomicrmw: @@ -2011,9 +2011,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: @@ -2023,9 +2023,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: @@ -2047,9 +2047,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: @@ -2134,7 +2134,7 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2149,7 +2149,7 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2191,7 +2191,7 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2205,7 +2205,7 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2232,7 +2232,7 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2321,9 +2321,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2337,9 +2337,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2383,9 +2383,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2398,9 +2398,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2427,9 +2427,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2527,9 +2527,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2543,9 +2543,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2589,9 +2589,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2604,9 +2604,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2633,9 +2633,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2903,7 +2903,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acquire_monotonic_cmpxchg: @@ -2917,7 +2917,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg: @@ -2958,7 +2958,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: @@ -2971,7 +2971,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: @@ -2997,7 +2997,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: @@ -3085,7 +3085,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3099,7 +3099,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -3140,7 +3140,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3153,7 +3153,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3179,7 +3179,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3270,9 +3270,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: @@ -3285,9 +3285,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: @@ -3330,9 +3330,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: @@ -3344,9 +3344,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: @@ -3372,9 +3372,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: @@ -3471,9 +3471,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: @@ -3486,9 +3486,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: @@ -3531,9 +3531,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: @@ -3545,9 +3545,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: @@ -3573,9 +3573,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: @@ -3673,7 +3673,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_monotonic_acquire_cmpxchg: @@ -3687,7 +3687,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg: @@ -3728,7 +3728,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: @@ -3741,7 +3741,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: @@ -3767,7 +3767,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: @@ -3856,7 +3856,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acquire_acquire_cmpxchg: @@ -3870,7 +3870,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg: @@ -3911,7 +3911,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: @@ -3924,7 +3924,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: @@ -3950,7 +3950,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: @@ -4038,9 +4038,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_release_acquire_cmpxchg: @@ -4053,9 +4053,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_release_acquire_cmpxchg: @@ -4098,9 +4098,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: @@ -4112,9 +4112,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: @@ -4140,9 +4140,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: @@ -4239,9 +4239,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: @@ -4254,9 +4254,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: @@ -4299,9 +4299,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: @@ -4313,9 +4313,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: @@ -4341,9 +4341,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: @@ -4440,9 +4440,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: @@ -4455,9 +4455,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: @@ -4500,9 +4500,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: @@ -4514,9 +4514,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: @@ -4542,9 +4542,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: @@ -4641,9 +4641,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: @@ -4656,9 +4656,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: @@ -4701,9 +4701,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: @@ -4715,9 +4715,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: @@ -4743,9 +4743,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: @@ -4842,9 +4842,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: @@ -4857,9 +4857,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: @@ -4902,9 +4902,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: @@ -4916,9 +4916,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: @@ -4944,9 +4944,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: @@ -5043,9 +5043,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_release_seq_cst_cmpxchg: @@ -5058,9 +5058,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg: @@ -5103,9 +5103,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: @@ -5117,9 +5117,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: @@ -5145,9 +5145,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: @@ -5244,9 +5244,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: @@ -5259,9 +5259,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: @@ -5304,9 +5304,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: @@ -5318,9 +5318,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: @@ -5346,9 +5346,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: @@ -5445,9 +5445,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5460,9 +5460,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5505,9 +5505,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5519,9 +5519,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5547,9 +5547,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5858,7 +5858,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -5875,7 +5875,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -5923,7 +5923,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -5939,7 +5939,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5970,7 +5970,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6073,7 +6073,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6091,7 +6091,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6142,7 +6142,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6159,7 +6159,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6191,7 +6191,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6302,9 +6302,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6320,9 +6320,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6372,9 +6372,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6389,9 +6389,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6422,9 +6422,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6536,9 +6536,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6554,9 +6554,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6606,9 +6606,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6623,9 +6623,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6656,9 +6656,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6771,7 +6771,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6788,7 +6788,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6836,7 +6836,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6852,7 +6852,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6883,7 +6883,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6987,7 +6987,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7004,7 +7004,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7052,7 +7052,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7068,7 +7068,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7099,7 +7099,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7202,9 +7202,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7220,9 +7220,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7272,9 +7272,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7289,9 +7289,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7322,9 +7322,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7436,9 +7436,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7454,9 +7454,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7506,9 +7506,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7523,9 +7523,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7556,9 +7556,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7670,9 +7670,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7688,9 +7688,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7740,9 +7740,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7757,9 +7757,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7790,9 +7790,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7904,9 +7904,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7922,9 +7922,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7974,9 +7974,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7991,9 +7991,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8024,9 +8024,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8138,9 +8138,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8156,9 +8156,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8208,9 +8208,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8225,9 +8225,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8258,9 +8258,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8372,9 +8372,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8390,9 +8390,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8442,9 +8442,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8459,9 +8459,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8492,9 +8492,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8606,9 +8606,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8624,9 +8624,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8676,9 +8676,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8693,9 +8693,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8726,9 +8726,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8840,9 +8840,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8858,9 +8858,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8910,9 +8910,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8927,9 +8927,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8960,9 +8960,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -9415,6 +9415,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9429,6 +9430,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9467,6 +9469,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9480,6 +9483,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9504,6 +9508,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9585,7 +9590,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9599,7 +9606,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9637,7 +9646,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9650,7 +9661,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9674,7 +9687,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10036,6 +10051,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10047,6 +10063,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10078,6 +10095,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10088,6 +10106,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10108,6 +10127,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10176,6 +10196,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10187,6 +10208,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10218,6 +10240,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10228,6 +10251,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10248,6 +10272,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10457,6 +10482,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10468,6 +10494,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10499,6 +10526,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10509,6 +10537,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10529,6 +10558,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10596,6 +10626,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10607,6 +10638,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10638,6 +10670,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10648,6 +10681,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10668,6 +10702,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10736,7 +10771,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10747,7 +10784,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10778,7 +10817,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10788,7 +10829,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10808,7 +10851,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10876,7 +10921,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10887,7 +10934,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10918,7 +10967,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10928,7 +10979,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10948,7 +11001,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -11017,6 +11072,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11032,6 +11088,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11073,6 +11130,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11087,6 +11145,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11113,6 +11172,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11199,7 +11259,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11214,7 +11276,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11255,7 +11319,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11269,7 +11335,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11295,7 +11363,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11382,7 +11452,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11397,7 +11469,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11438,7 +11512,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11452,7 +11528,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11478,7 +11556,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11735,6 +11815,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11748,6 +11829,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11785,6 +11867,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11797,6 +11880,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11821,6 +11905,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11901,6 +11986,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11914,6 +12000,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11951,6 +12038,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11963,6 +12051,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11987,6 +12076,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12068,7 +12158,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12081,7 +12173,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12118,7 +12212,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12130,7 +12226,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12154,7 +12252,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12235,7 +12335,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12248,7 +12350,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12285,7 +12389,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12297,7 +12403,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12321,7 +12429,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12403,6 +12513,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12416,6 +12527,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12453,6 +12565,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12465,6 +12578,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12489,6 +12603,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12570,6 +12685,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12583,6 +12699,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12620,6 +12737,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12632,6 +12750,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12656,6 +12775,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12736,7 +12856,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12749,7 +12871,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12786,7 +12910,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12798,7 +12924,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12822,7 +12950,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12903,7 +13033,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12916,7 +13048,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12953,7 +13087,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12965,7 +13101,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12989,7 +13127,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -13070,7 +13210,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13083,7 +13225,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13120,7 +13264,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13132,7 +13278,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13156,7 +13304,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13237,7 +13387,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13250,7 +13402,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13287,7 +13441,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13299,7 +13455,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13323,7 +13481,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13404,7 +13564,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13417,7 +13579,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13454,7 +13618,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13466,7 +13632,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13490,7 +13658,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13571,7 +13741,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13584,7 +13756,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13621,7 +13795,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13633,7 +13809,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13657,7 +13835,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13738,7 +13918,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13751,7 +13933,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13788,7 +13972,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13800,7 +13986,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13824,7 +14012,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13905,7 +14095,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13918,7 +14110,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13955,7 +14149,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13967,7 +14163,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13991,7 +14189,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -14284,6 +14484,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14301,6 +14502,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14348,6 +14550,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14364,6 +14567,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14394,6 +14598,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14494,6 +14699,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14511,6 +14717,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14558,6 +14765,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14574,6 +14782,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14604,6 +14813,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14705,7 +14915,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14722,7 +14934,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14769,7 +14983,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14785,7 +15001,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14815,7 +15033,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14916,7 +15136,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14933,7 +15155,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14980,7 +15204,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14996,7 +15222,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15026,7 +15254,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15128,6 +15358,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15145,6 +15376,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15192,6 +15424,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15208,6 +15441,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15238,6 +15472,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15339,6 +15574,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15356,6 +15592,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15403,6 +15640,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15419,6 +15657,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15449,6 +15688,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15549,7 +15789,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15566,7 +15808,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15613,7 +15857,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15629,7 +15875,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15659,7 +15907,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15760,7 +16010,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15777,7 +16029,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15824,7 +16078,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15840,7 +16096,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15870,7 +16128,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15971,7 +16231,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15988,7 +16250,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16035,7 +16299,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16051,7 +16317,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16081,7 +16349,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16182,7 +16452,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16199,7 +16471,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16246,7 +16520,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16262,7 +16538,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16292,7 +16570,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16393,7 +16673,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16410,7 +16692,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16457,7 +16741,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16473,7 +16759,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16503,7 +16791,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16604,7 +16894,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16621,7 +16913,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16668,7 +16962,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16684,7 +16980,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16714,7 +17012,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16815,7 +17115,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16832,7 +17134,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16879,7 +17183,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16895,7 +17201,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16925,7 +17233,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17026,7 +17336,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17043,7 +17355,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -17090,7 +17404,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17106,7 +17422,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17136,7 +17454,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir index adaee7ebaddd3..7ab8a51aef68d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir @@ -1,18 +1,23 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck -check-prefix=GCN %s --- -# GCN-LABEL: name: load_singlethread_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_unordered body: | bb.0: + ; GCN-LABEL: name: load_singlethread_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -26,17 +31,21 @@ body: | ... --- -# GCN-LABEL: name: load_singlethread_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_monotonic body: | bb.0: + ; GCN-LABEL: name: load_singlethread_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -50,17 +59,22 @@ body: | ... --- -# GCN-LABEL: name: load_singlethread_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_acquire body: | bb.0: + ; GCN-LABEL: name: load_singlethread_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -74,17 +88,23 @@ body: | ... --- -# GCN-LABEL: name: load_singlethread_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_singlethread_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -98,17 +118,21 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_unordered body: | bb.0: + ; GCN-LABEL: name: load_wavefront_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -122,17 +146,21 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_monotonic body: | bb.0: + ; GCN-LABEL: name: load_wavefront_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -146,17 +174,22 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_acquire body: | bb.0: + ; GCN-LABEL: name: load_wavefront_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -170,17 +203,23 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_wavefront_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -194,17 +233,21 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_unordered body: | bb.0: + ; GCN-LABEL: name: load_workgroup_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -218,17 +261,21 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_monotonic body: | bb.0: + ; GCN-LABEL: name: load_workgroup_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -242,17 +289,22 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_acquire body: | bb.0: + ; GCN-LABEL: name: load_workgroup_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -266,17 +318,23 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_workgroup_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -290,17 +348,21 @@ body: | ... --- -# GCN-LABEL: name: load_agent_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_unordered body: | bb.0: + ; GCN-LABEL: name: load_agent_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -314,17 +376,21 @@ body: | ... --- -# GCN-LABEL: name: load_agent_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_monotonic body: | bb.0: + ; GCN-LABEL: name: load_agent_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -338,17 +404,22 @@ body: | ... --- -# GCN-LABEL: name: load_agent_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_acquire body: | bb.0: + ; GCN-LABEL: name: load_agent_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -362,17 +433,23 @@ body: | ... --- -# GCN-LABEL: name: load_agent_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_agent_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -386,17 +463,21 @@ body: | ... --- -# GCN-LABEL: name: load_system_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_unordered body: | bb.0: + ; GCN-LABEL: name: load_system_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -410,17 +491,21 @@ body: | ... --- -# GCN-LABEL: name: load_system_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_monotonic body: | bb.0: + ; GCN-LABEL: name: load_system_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -434,17 +519,22 @@ body: | ... --- -# GCN-LABEL: name: load_system_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_acquire body: | bb.0: + ; GCN-LABEL: name: load_system_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -458,17 +548,23 @@ body: | ... --- -# GCN-LABEL: name: load_system_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_system_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -482,17 +578,19 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_unordered body: | bb.0: + ; GCN-LABEL: name: store_singlethread_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -504,17 +602,19 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_monotonic body: | bb.0: + ; GCN-LABEL: name: store_singlethread_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -526,17 +626,20 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_release body: | bb.0: + ; GCN-LABEL: name: store_singlethread_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -548,17 +651,20 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_singlethread_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -570,17 +676,19 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_unordered body: | bb.0: + ; GCN-LABEL: name: store_wavefront_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -592,17 +700,19 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_monotonic body: | bb.0: + ; GCN-LABEL: name: store_wavefront_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -614,17 +724,20 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_release body: | bb.0: + ; GCN-LABEL: name: store_wavefront_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -636,17 +749,20 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_wavefront_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -658,17 +774,19 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_unordered body: | bb.0: + ; GCN-LABEL: name: store_workgroup_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -680,17 +798,19 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_monotonic body: | bb.0: + ; GCN-LABEL: name: store_workgroup_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -702,17 +822,20 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_release body: | bb.0: + ; GCN-LABEL: name: store_workgroup_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -724,17 +847,20 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_workgroup_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -746,17 +872,19 @@ body: | ... --- -# GCN-LABEL: name: store_agent_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_unordered body: | bb.0: + ; GCN-LABEL: name: store_agent_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -768,17 +896,19 @@ body: | ... --- -# GCN-LABEL: name: store_agent_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_monotonic body: | bb.0: + ; GCN-LABEL: name: store_agent_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -790,17 +920,20 @@ body: | ... --- -# GCN-LABEL: name: store_agent_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_release body: | bb.0: + ; GCN-LABEL: name: store_agent_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -812,17 +945,20 @@ body: | ... --- -# GCN-LABEL: name: store_agent_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_agent_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -834,17 +970,19 @@ body: | ... --- -# GCN-LABEL: name: store_system_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_unordered body: | bb.0: + ; GCN-LABEL: name: store_system_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") unordered (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -856,17 +994,19 @@ body: | ... --- -# GCN-LABEL: name: store_system_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_monotonic body: | bb.0: + ; GCN-LABEL: name: store_system_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") monotonic (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -878,17 +1018,20 @@ body: | ... --- -# GCN-LABEL: name: store_system_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_release body: | bb.0: + ; GCN-LABEL: name: store_system_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -900,17 +1043,20 @@ body: | ... --- -# GCN-LABEL: name: store_system_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_system_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -922,17 +1068,19 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_unordered body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -944,17 +1092,19 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_monotonic body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -966,17 +1116,20 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_acquire body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -988,17 +1141,20 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_release body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -1010,17 +1166,21 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_acq_rel -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_acq_rel body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_acq_rel + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -1032,17 +1192,21 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_seq_cst body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir index 9405c8a946627..fd5c715ad0c60 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir @@ -1,18 +1,23 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck -check-prefix=GCN %s --- -# GCN-LABEL: name: load_singlethread_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_unordered body: | bb.0: + ; GCN-LABEL: name: load_singlethread_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 1, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -26,17 +31,21 @@ body: | ... --- -# GCN-LABEL: name: load_singlethread_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_monotonic body: | bb.0: + ; GCN-LABEL: name: load_singlethread_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -50,17 +59,21 @@ body: | ... --- -# GCN-LABEL: name: load_singlethread_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_acquire body: | bb.0: + ; GCN-LABEL: name: load_singlethread_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -74,17 +87,22 @@ body: | ... --- -# GCN-LABEL: name: load_singlethread_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_singlethread_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -98,17 +116,21 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_unordered body: | bb.0: + ; GCN-LABEL: name: load_wavefront_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -122,17 +144,21 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_monotonic body: | bb.0: + ; GCN-LABEL: name: load_wavefront_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -146,17 +172,21 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_acquire body: | bb.0: + ; GCN-LABEL: name: load_wavefront_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -170,17 +200,21 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_wavefront_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -194,17 +228,21 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_unordered body: | bb.0: + ; GCN-LABEL: name: load_workgroup_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -218,17 +256,21 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_monotonic body: | bb.0: + ; GCN-LABEL: name: load_workgroup_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -242,17 +284,21 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_acquire body: | bb.0: + ; GCN-LABEL: name: load_workgroup_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -266,17 +312,21 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_workgroup_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -290,17 +340,21 @@ body: | ... --- -# GCN-LABEL: name: load_agent_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_unordered body: | bb.0: + ; GCN-LABEL: name: load_agent_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -314,17 +368,21 @@ body: | ... --- -# GCN-LABEL: name: load_agent_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_monotonic body: | bb.0: + ; GCN-LABEL: name: load_agent_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -338,17 +396,21 @@ body: | ... --- -# GCN-LABEL: name: load_agent_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_acquire body: | bb.0: + ; GCN-LABEL: name: load_agent_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -362,17 +424,21 @@ body: | ... --- -# GCN-LABEL: name: load_agent_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_agent_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -386,17 +452,21 @@ body: | ... --- -# GCN-LABEL: name: load_system_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_unordered body: | bb.0: + ; GCN-LABEL: name: load_system_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -410,17 +480,21 @@ body: | ... --- -# GCN-LABEL: name: load_system_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_monotonic body: | bb.0: + ; GCN-LABEL: name: load_system_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -434,17 +508,21 @@ body: | ... --- -# GCN-LABEL: name: load_system_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_acquire body: | bb.0: + ; GCN-LABEL: name: load_system_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -458,17 +536,21 @@ body: | ... --- -# GCN-LABEL: name: load_system_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_system_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -482,17 +564,19 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_unordered body: | bb.0: + ; GCN-LABEL: name: store_singlethread_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -504,17 +588,19 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_monotonic body: | bb.0: + ; GCN-LABEL: name: store_singlethread_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -526,17 +612,20 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_release body: | bb.0: + ; GCN-LABEL: name: store_singlethread_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -548,17 +637,20 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_singlethread_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -570,17 +662,19 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_unordered body: | bb.0: + ; GCN-LABEL: name: store_wavefront_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -592,17 +686,19 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_monotonic body: | bb.0: + ; GCN-LABEL: name: store_wavefront_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -614,17 +710,19 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_release body: | bb.0: + ; GCN-LABEL: name: store_wavefront_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -636,17 +734,19 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_wavefront_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -658,17 +758,19 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_unordered body: | bb.0: + ; GCN-LABEL: name: store_workgroup_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -680,17 +782,19 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_monotonic body: | bb.0: + ; GCN-LABEL: name: store_workgroup_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -702,17 +806,19 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_release body: | bb.0: + ; GCN-LABEL: name: store_workgroup_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -724,17 +830,19 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_workgroup_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -746,17 +854,19 @@ body: | ... --- -# GCN-LABEL: name: store_agent_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_unordered body: | bb.0: + ; GCN-LABEL: name: store_agent_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -768,17 +878,19 @@ body: | ... --- -# GCN-LABEL: name: store_agent_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_monotonic body: | bb.0: + ; GCN-LABEL: name: store_agent_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -790,17 +902,19 @@ body: | ... --- -# GCN-LABEL: name: store_agent_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_release body: | bb.0: + ; GCN-LABEL: name: store_agent_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -812,17 +926,19 @@ body: | ... --- -# GCN-LABEL: name: store_agent_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_agent_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -834,17 +950,19 @@ body: | ... --- -# GCN-LABEL: name: store_system_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_unordered body: | bb.0: + ; GCN-LABEL: name: store_system_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store unordered (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -856,17 +974,19 @@ body: | ... --- -# GCN-LABEL: name: store_system_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_monotonic body: | bb.0: + ; GCN-LABEL: name: store_system_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store monotonic (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -878,17 +998,19 @@ body: | ... --- -# GCN-LABEL: name: store_system_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_release body: | bb.0: + ; GCN-LABEL: name: store_system_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -900,17 +1022,19 @@ body: | ... --- -# GCN-LABEL: name: store_system_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_system_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -922,17 +1046,19 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_unordered body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -944,17 +1070,19 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_monotonic body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -966,17 +1094,19 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_acquire body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -988,17 +1118,20 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_release body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -1010,17 +1143,20 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_acq_rel -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_acq_rel body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_acq_rel + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -1032,17 +1168,20 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_seq_cst body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1