diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 56fec409d11ae..520112367e997 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -1084,6 +1084,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, bool VMCnt = false; bool LGKMCnt = false; + bool DirectLDSWait = false; if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != SIAtomicAddrSpace::NONE) { @@ -1104,6 +1105,10 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, } if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + // Wait for direct loads to LDS from global memory to ensure that + // LDS operations cannot be reordered with respect to global memory + // operations. + DirectLDSWait = true; switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -1149,6 +1154,18 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, } } + // Conservatively wait for vmcnt(0) to ensure that LDS operations and direct + // LDS loads from global memory cannot be reordered with respect to each + // other. This waitcnt can be safely optimized to wait for a higher vmcnt + // based on the number of outstanding direct LDS loads. + if (DirectLDSWait) { + unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt( + IV, 0, getExpcntBitMask(IV), getLgkmcntBitMask(IV)); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_DIRECT_LDS_LOAD_soft)) + .addImm(WaitCntImmediate); + Changed = true; + } + if (VMCnt || LGKMCnt) { unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(IV, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index 66037615f0ba0..7f197b3580042 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -13,12 +13,14 @@ define amdgpu_kernel void @system_one_as_acquire() #0 { ; GFX6-LABEL: name: system_one_as_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_one_as_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -62,11 +64,13 @@ entry: define amdgpu_kernel void @system_one_as_release() #0 { ; GFX6-LABEL: name: system_one_as_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_one_as_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -101,12 +105,14 @@ entry: define amdgpu_kernel void @system_one_as_acq_rel() #0 { ; GFX6-LABEL: name: system_one_as_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_one_as_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -150,12 +156,14 @@ entry: define amdgpu_kernel void @system_one_as_seq_cst() #0 { ; GFX6-LABEL: name: system_one_as_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_one_as_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -199,10 +207,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_acquire() #0 { ; GFX6-LABEL: name: singlethread_one_as_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_one_as_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_one_as_acquire @@ -228,10 +238,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_release() #0 { ; GFX6-LABEL: name: singlethread_one_as_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_one_as_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_one_as_release @@ -257,10 +269,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_acq_rel() #0 { ; GFX6-LABEL: name: singlethread_one_as_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_one_as_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_one_as_acq_rel @@ -286,10 +300,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_seq_cst() #0 { ; GFX6-LABEL: name: singlethread_one_as_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_one_as_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_one_as_seq_cst @@ -315,12 +331,14 @@ entry: define amdgpu_kernel void @agent_one_as_acquire() #0 { ; GFX6-LABEL: name: agent_one_as_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_one_as_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -364,11 +382,13 @@ entry: define amdgpu_kernel void @agent_one_as_release() #0 { ; GFX6-LABEL: name: agent_one_as_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_one_as_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -403,12 +423,14 @@ entry: define amdgpu_kernel void @agent_one_as_acq_rel() #0 { ; GFX6-LABEL: name: agent_one_as_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_one_as_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -452,12 +474,14 @@ entry: define amdgpu_kernel void @agent_one_as_seq_cst() #0 { ; GFX6-LABEL: name: agent_one_as_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 3952 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_one_as_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 3952 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -501,10 +525,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acquire() #0 { ; GFX6-LABEL: name: workgroup_one_as_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_acquire @@ -536,10 +562,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_release() #0 { ; GFX6-LABEL: name: workgroup_one_as_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_release @@ -569,10 +597,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { ; GFX6-LABEL: name: workgroup_one_as_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel @@ -604,10 +634,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; GFX6-LABEL: name: workgroup_one_as_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst @@ -639,10 +671,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_acquire() #0 { ; GFX6-LABEL: name: wavefront_one_as_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_one_as_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_one_as_acquire @@ -668,10 +702,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_release() #0 { ; GFX6-LABEL: name: wavefront_one_as_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_one_as_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_one_as_release @@ -697,10 +733,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_acq_rel() #0 { ; GFX6-LABEL: name: wavefront_one_as_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_one_as_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_one_as_acq_rel @@ -726,10 +764,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_seq_cst() #0 { ; GFX6-LABEL: name: wavefront_one_as_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_one_as_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_one_as_seq_cst @@ -755,12 +795,14 @@ entry: define amdgpu_kernel void @system_acquire() #0 { ; GFX6-LABEL: name: system_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -804,11 +846,13 @@ entry: define amdgpu_kernel void @system_release() #0 { ; GFX6-LABEL: name: system_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -843,12 +887,14 @@ entry: define amdgpu_kernel void @system_acq_rel() #0 { ; GFX6-LABEL: name: system_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -892,12 +938,14 @@ entry: define amdgpu_kernel void @system_seq_cst() #0 { ; GFX6-LABEL: name: system_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: system_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -941,10 +989,12 @@ entry: define amdgpu_kernel void @singlethread_acquire() #0 { ; GFX6-LABEL: name: singlethread_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_acquire @@ -970,10 +1020,12 @@ entry: define amdgpu_kernel void @singlethread_release() #0 { ; GFX6-LABEL: name: singlethread_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_release @@ -999,10 +1051,12 @@ entry: define amdgpu_kernel void @singlethread_acq_rel() #0 { ; GFX6-LABEL: name: singlethread_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_acq_rel @@ -1028,10 +1082,12 @@ entry: define amdgpu_kernel void @singlethread_seq_cst() #0 { ; GFX6-LABEL: name: singlethread_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_seq_cst @@ -1057,12 +1113,14 @@ entry: define amdgpu_kernel void @agent_acquire() #0 { ; GFX6-LABEL: name: agent_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -1106,11 +1164,13 @@ entry: define amdgpu_kernel void @agent_release() #0 { ; GFX6-LABEL: name: agent_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -1145,12 +1205,14 @@ entry: define amdgpu_kernel void @agent_acq_rel() #0 { ; GFX6-LABEL: name: agent_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -1194,12 +1256,14 @@ entry: define amdgpu_kernel void @agent_seq_cst() #0 { ; GFX6-LABEL: name: agent_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 112 ; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: agent_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 112 ; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec ; GFX8-NEXT: S_ENDPGM 0 @@ -1243,11 +1307,13 @@ entry: define amdgpu_kernel void @workgroup_acquire() #0 { ; GFX6-LABEL: name: workgroup_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 127 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 127 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -1282,11 +1348,13 @@ entry: define amdgpu_kernel void @workgroup_release() #0 { ; GFX6-LABEL: name: workgroup_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 127 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 127 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -1319,11 +1387,13 @@ entry: define amdgpu_kernel void @workgroup_acq_rel() #0 { ; GFX6-LABEL: name: workgroup_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 127 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 127 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -1358,11 +1428,13 @@ entry: define amdgpu_kernel void @workgroup_seq_cst() #0 { ; GFX6-LABEL: name: workgroup_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_WAITCNT_soft 127 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_WAITCNT_soft 127 ; GFX8-NEXT: S_ENDPGM 0 ; @@ -1397,10 +1469,12 @@ entry: define amdgpu_kernel void @wavefront_acquire() #0 { ; GFX6-LABEL: name: wavefront_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_acquire @@ -1426,10 +1500,12 @@ entry: define amdgpu_kernel void @wavefront_release() #0 { ; GFX6-LABEL: name: wavefront_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_release @@ -1455,10 +1531,12 @@ entry: define amdgpu_kernel void @wavefront_acq_rel() #0 { ; GFX6-LABEL: name: wavefront_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_acq_rel @@ -1484,10 +1562,12 @@ entry: define amdgpu_kernel void @wavefront_seq_cst() #0 { ; GFX6-LABEL: name: wavefront_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll index 2bf4a2c028fdc..9fd44da40453f 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll @@ -17,12 +17,14 @@ define amdgpu_ps void @ham(float %arg, float %arg1) #0 { ; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1 ; GCN-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb4 ; GCN-NEXT: v_mov_b32_e32 v0, 4 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v0, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; divergent unreachable -; GCN-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GCN-NEXT: .LBB0_2: ; %UnifiedReturnBlock ; GCN-NEXT: s_endpgm bb: %tmp = fcmp ogt float %arg, 0.000000e+00 diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index f78cb0daee5c9..8ee5a1ef932b9 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -404,7 +404,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: v_mov_b32_e32 v0, 0 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 ; GCN_DBG-NEXT: ds_read_u8 v0, v0 -; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) +; GCN_DBG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v0 ; GCN_DBG-NEXT: s_and_b32 s0, 1, s0 ; GCN_DBG-NEXT: s_cmp_eq_u32 s0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index b5665835eaf7a..8c2011b49ceb9 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -7807,10 +7807,12 @@ define amdgpu_kernel void @multi_same_block(i32 %arg) { ; NOOPT-NEXT: ; implicit-def: $sgpr0 ; NOOPT-NEXT: v_mov_b32_e32 v0, s0 ; NOOPT-NEXT: ds_write_b32 v0, v2 +; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: s_mov_b32 m0, -1 ; NOOPT-NEXT: ; implicit-def: $sgpr0 ; NOOPT-NEXT: v_mov_b32_e32 v0, s0 ; NOOPT-NEXT: ds_write_b32 v0, v1 +; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: s_endpgm ; ; SI-MOVREL-LABEL: multi_same_block: diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index 0681263b7428e..04e352984b948 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -71,6 +71,7 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b32 v0, v1 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: .LBB0_2: ; %end ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index 44415657b6336..c4451fd0891dc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -165,8 +165,9 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX8-NOOPT-NEXT: s_mov_b32 m0, -1 ; GFX8-NOOPT-NEXT: ds_read_b32 v0, v3 -; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NOOPT-NEXT: s_barrier +; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) ; GFX8-NOOPT-NEXT: v_add_u32_e64 v1, s[0:1], v0, v0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: s_nop 1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll index 971015b391ca8..445f597516e8c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll @@ -16,12 +16,12 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX6-LABEL: workgroup_acquire_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_acquire_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_acquire_fence: @@ -36,12 +36,12 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_acquire_fence: @@ -50,7 +50,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_acquire_fence: @@ -84,12 +84,12 @@ entry: define amdgpu_kernel void @workgroup_release_fence() { ; GFX6-LABEL: workgroup_release_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_release_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_release_fence: @@ -104,12 +104,12 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_release_fence: @@ -118,7 +118,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_release_fence: @@ -150,12 +150,12 @@ entry: define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX6-LABEL: workgroup_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_acq_rel_fence: @@ -170,12 +170,12 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence: @@ -184,7 +184,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_acq_rel_fence: @@ -216,12 +216,12 @@ entry: define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX6-LABEL: workgroup_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_seq_cst_fence: @@ -236,12 +236,12 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_seq_cst_fence: @@ -250,7 +250,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_seq_cst_fence: @@ -282,10 +282,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX6-LABEL: workgroup_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acquire_fence: @@ -298,10 +300,12 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: @@ -310,6 +314,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: @@ -339,10 +344,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX6-LABEL: workgroup_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_release_fence: @@ -355,10 +362,12 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -367,6 +376,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -396,10 +406,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX6-LABEL: workgroup_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -412,10 +424,12 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -424,6 +438,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -453,10 +468,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX6-LABEL: workgroup_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_seq_cst_fence: @@ -469,10 +486,12 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: @@ -481,6 +500,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: @@ -510,12 +530,12 @@ entry: define amdgpu_kernel void @agent_acquire_fence() { ; GFX6-LABEL: agent_acquire_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_acquire_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_acquire_fence: @@ -530,12 +550,12 @@ define amdgpu_kernel void @agent_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_acquire_fence: @@ -544,7 +564,7 @@ define amdgpu_kernel void @agent_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_acquire_fence: @@ -578,12 +598,12 @@ entry: define amdgpu_kernel void @agent_release_fence() { ; GFX6-LABEL: agent_release_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_release_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_release_fence: @@ -598,12 +618,12 @@ define amdgpu_kernel void @agent_release_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_release_fence: @@ -612,7 +632,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_release_fence: @@ -644,12 +664,12 @@ entry: define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX6-LABEL: agent_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_acq_rel_fence: @@ -664,12 +684,12 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_acq_rel_fence: @@ -678,7 +698,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_acq_rel_fence: @@ -710,12 +730,12 @@ entry: define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX6-LABEL: agent_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_seq_cst_fence: @@ -730,12 +750,12 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_seq_cst_fence: @@ -744,7 +764,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_seq_cst_fence: @@ -776,10 +796,12 @@ entry: define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX6-LABEL: agent_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_acquire_fence: @@ -792,10 +814,12 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_acquire_fence: @@ -804,6 +828,7 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_one_as_acquire_fence: @@ -833,10 +858,12 @@ entry: define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX6-LABEL: agent_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_release_fence: @@ -849,10 +876,12 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_release_fence: @@ -861,6 +890,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_one_as_release_fence: @@ -890,10 +920,12 @@ entry: define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX6-LABEL: agent_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_acq_rel_fence: @@ -906,10 +938,12 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_acq_rel_fence: @@ -918,6 +952,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_one_as_acq_rel_fence: @@ -947,10 +982,12 @@ entry: define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX6-LABEL: agent_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_seq_cst_fence: @@ -963,10 +1000,12 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: agent_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_seq_cst_fence: @@ -975,6 +1014,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_one_as_seq_cst_fence: @@ -1004,12 +1044,12 @@ entry: define amdgpu_kernel void @system_acquire_fence() { ; GFX6-LABEL: system_acquire_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_acquire_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_acquire_fence: @@ -1024,12 +1064,12 @@ define amdgpu_kernel void @system_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: system_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_acquire_fence: @@ -1038,7 +1078,7 @@ define amdgpu_kernel void @system_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_acquire_fence: @@ -1072,12 +1112,12 @@ entry: define amdgpu_kernel void @system_release_fence() { ; GFX6-LABEL: system_release_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_release_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_release_fence: @@ -1092,12 +1132,12 @@ define amdgpu_kernel void @system_release_fence() { ; ; SKIP-CACHE-INV-LABEL: system_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_release_fence: @@ -1106,7 +1146,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_release_fence: @@ -1138,12 +1178,12 @@ entry: define amdgpu_kernel void @system_acq_rel_fence() { ; GFX6-LABEL: system_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_acq_rel_fence: @@ -1158,12 +1198,12 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: system_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_acq_rel_fence: @@ -1172,7 +1212,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_acq_rel_fence: @@ -1204,12 +1244,12 @@ entry: define amdgpu_kernel void @system_seq_cst_fence() { ; GFX6-LABEL: system_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_seq_cst_fence: @@ -1224,12 +1264,12 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: system_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_seq_cst_fence: @@ -1238,7 +1278,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_seq_cst_fence: @@ -1270,10 +1310,12 @@ entry: define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX6-LABEL: system_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_acquire_fence: @@ -1286,10 +1328,12 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: system_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_acquire_fence: @@ -1298,6 +1342,7 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_one_as_acquire_fence: @@ -1327,10 +1372,12 @@ entry: define amdgpu_kernel void @system_one_as_release_fence() { ; GFX6-LABEL: system_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_release_fence: @@ -1343,10 +1390,12 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; ; SKIP-CACHE-INV-LABEL: system_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_release_fence: @@ -1355,6 +1404,7 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_one_as_release_fence: @@ -1384,10 +1434,12 @@ entry: define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX6-LABEL: system_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_acq_rel_fence: @@ -1400,10 +1452,12 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: system_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_acq_rel_fence: @@ -1412,6 +1466,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_one_as_acq_rel_fence: @@ -1441,10 +1496,12 @@ entry: define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX6-LABEL: system_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_seq_cst_fence: @@ -1457,10 +1514,12 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: system_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_seq_cst_fence: @@ -1469,6 +1528,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_one_as_seq_cst_fence: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index 0e459ed0f1243..0a68ec2bfa1b9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -16,10 +16,12 @@ define amdgpu_kernel void @singlethread_acquire_fence() { ; GFX6-LABEL: singlethread_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_acquire_fence: @@ -32,10 +34,12 @@ define amdgpu_kernel void @singlethread_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_acquire_fence: @@ -44,6 +48,7 @@ define amdgpu_kernel void @singlethread_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_acquire_fence: @@ -73,10 +78,12 @@ entry: define amdgpu_kernel void @singlethread_release_fence() { ; GFX6-LABEL: singlethread_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_release_fence: @@ -89,10 +96,12 @@ define amdgpu_kernel void @singlethread_release_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_release_fence: @@ -101,6 +110,7 @@ define amdgpu_kernel void @singlethread_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_release_fence: @@ -130,10 +140,12 @@ entry: define amdgpu_kernel void @singlethread_acq_rel_fence() { ; GFX6-LABEL: singlethread_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_acq_rel_fence: @@ -146,10 +158,12 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_acq_rel_fence: @@ -158,6 +172,7 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_acq_rel_fence: @@ -187,10 +202,12 @@ entry: define amdgpu_kernel void @singlethread_seq_cst_fence() { ; GFX6-LABEL: singlethread_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_seq_cst_fence: @@ -203,10 +220,12 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_seq_cst_fence: @@ -215,6 +234,7 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_seq_cst_fence: @@ -244,10 +264,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_acquire_fence() { ; GFX6-LABEL: singlethread_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_one_as_acquire_fence: @@ -260,10 +282,12 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acquire_fence: @@ -272,6 +296,7 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_one_as_acquire_fence: @@ -301,10 +326,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_release_fence() { ; GFX6-LABEL: singlethread_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_one_as_release_fence: @@ -317,10 +344,12 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_release_fence: @@ -329,6 +358,7 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_one_as_release_fence: @@ -358,10 +388,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() { ; GFX6-LABEL: singlethread_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_one_as_acq_rel_fence: @@ -374,10 +406,12 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: @@ -386,6 +420,7 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: @@ -415,10 +450,12 @@ entry: define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() { ; GFX6-LABEL: singlethread_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_one_as_seq_cst_fence: @@ -431,10 +468,12 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: singlethread_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: @@ -443,6 +482,7 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: @@ -472,10 +512,12 @@ entry: define amdgpu_kernel void @wavefront_acquire_fence() { ; GFX6-LABEL: wavefront_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_acquire_fence: @@ -488,10 +530,12 @@ define amdgpu_kernel void @wavefront_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_acquire_fence: @@ -500,6 +544,7 @@ define amdgpu_kernel void @wavefront_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_acquire_fence: @@ -529,10 +574,12 @@ entry: define amdgpu_kernel void @wavefront_release_fence() { ; GFX6-LABEL: wavefront_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_release_fence: @@ -545,10 +592,12 @@ define amdgpu_kernel void @wavefront_release_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_release_fence: @@ -557,6 +606,7 @@ define amdgpu_kernel void @wavefront_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_release_fence: @@ -586,10 +636,12 @@ entry: define amdgpu_kernel void @wavefront_acq_rel_fence() { ; GFX6-LABEL: wavefront_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_acq_rel_fence: @@ -602,10 +654,12 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_acq_rel_fence: @@ -614,6 +668,7 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_acq_rel_fence: @@ -643,10 +698,12 @@ entry: define amdgpu_kernel void @wavefront_seq_cst_fence() { ; GFX6-LABEL: wavefront_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_seq_cst_fence: @@ -659,10 +716,12 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_seq_cst_fence: @@ -671,6 +730,7 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_seq_cst_fence: @@ -700,10 +760,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_acquire_fence() { ; GFX6-LABEL: wavefront_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_one_as_acquire_fence: @@ -716,10 +778,12 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acquire_fence: @@ -728,6 +792,7 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_one_as_acquire_fence: @@ -757,10 +822,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_release_fence() { ; GFX6-LABEL: wavefront_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_one_as_release_fence: @@ -773,10 +840,12 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_release_fence: @@ -785,6 +854,7 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_one_as_release_fence: @@ -814,10 +884,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() { ; GFX6-LABEL: wavefront_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_one_as_acq_rel_fence: @@ -830,10 +902,12 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: @@ -842,6 +916,7 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: @@ -871,10 +946,12 @@ entry: define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() { ; GFX6-LABEL: wavefront_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_one_as_seq_cst_fence: @@ -887,10 +964,12 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: wavefront_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: @@ -899,6 +978,7 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: @@ -928,12 +1008,12 @@ entry: define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX6-LABEL: workgroup_acquire_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_acquire_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_acquire_fence: @@ -950,12 +1030,12 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_acquire_fence: @@ -966,7 +1046,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_acquire_fence: @@ -1006,12 +1086,12 @@ entry: define amdgpu_kernel void @workgroup_release_fence() { ; GFX6-LABEL: workgroup_release_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_release_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_release_fence: @@ -1027,12 +1107,12 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_release_fence: @@ -1042,7 +1122,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_release_fence: @@ -1081,12 +1161,12 @@ entry: define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX6-LABEL: workgroup_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_acq_rel_fence: @@ -1103,12 +1183,12 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence: @@ -1119,7 +1199,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_acq_rel_fence: @@ -1161,12 +1241,12 @@ entry: define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX6-LABEL: workgroup_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_seq_cst_fence: @@ -1183,12 +1263,12 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_seq_cst_fence: @@ -1199,7 +1279,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_seq_cst_fence: @@ -1241,10 +1321,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX6-LABEL: workgroup_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acquire_fence: @@ -1260,10 +1342,12 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: @@ -1274,6 +1358,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: @@ -1311,10 +1396,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX6-LABEL: workgroup_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_release_fence: @@ -1329,10 +1416,12 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -1342,6 +1431,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -1378,10 +1468,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX6-LABEL: workgroup_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -1397,10 +1489,12 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -1411,6 +1505,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -1450,10 +1545,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX6-LABEL: workgroup_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_seq_cst_fence: @@ -1469,10 +1566,12 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: @@ -1483,6 +1582,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index b88a10ab24a98..7e243ad064f7c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -388,9 +388,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -438,9 +439,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -453,8 +455,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -479,8 +482,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -569,10 +573,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -619,10 +625,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -634,9 +642,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -660,9 +670,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1050,6 +1062,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1091,6 +1104,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1103,6 +1117,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1125,6 +1140,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1199,6 +1215,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1240,6 +1257,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1252,6 +1270,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1274,6 +1293,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1498,6 +1518,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1539,6 +1560,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1551,6 +1573,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1573,6 +1596,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1646,6 +1670,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1687,6 +1712,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1699,6 +1725,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1721,6 +1748,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1795,7 +1823,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1836,7 +1866,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1848,7 +1880,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1870,7 +1904,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1944,7 +1980,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -1985,7 +2023,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -1997,7 +2037,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -2019,7 +2061,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -2094,9 +2138,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2147,9 +2192,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2163,8 +2209,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2191,8 +2238,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2287,10 +2335,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2340,10 +2390,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2356,9 +2408,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2384,9 +2438,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2481,10 +2537,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2534,10 +2592,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2550,9 +2610,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2578,9 +2640,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2928,6 +2992,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3011,6 +3076,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3027,6 +3093,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3057,6 +3124,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3165,6 +3233,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3248,6 +3317,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3264,6 +3334,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3294,6 +3365,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3403,7 +3475,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3486,7 +3560,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3502,7 +3578,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3532,7 +3610,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3641,7 +3721,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3724,7 +3806,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3740,7 +3824,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3770,7 +3856,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3880,6 +3968,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -3963,6 +4052,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -3979,6 +4069,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -4009,6 +4100,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -4118,6 +4210,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4201,6 +4294,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4217,6 +4311,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4247,6 +4342,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4355,7 +4451,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4438,7 +4536,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4454,7 +4554,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4484,7 +4586,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4593,7 +4697,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4676,7 +4782,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4692,7 +4800,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4722,7 +4832,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4831,7 +4943,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -4914,7 +5028,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -4930,7 +5046,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -4960,7 +5078,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -5069,7 +5189,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5152,7 +5274,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5168,7 +5292,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5198,7 +5324,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5307,7 +5435,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5390,7 +5520,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5406,7 +5538,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5436,7 +5570,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5545,7 +5681,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5628,7 +5766,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5644,7 +5784,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5674,7 +5816,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5783,7 +5927,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5866,7 +6012,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5882,7 +6030,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5912,7 +6062,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -6021,7 +6173,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6104,7 +6258,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6120,7 +6276,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6150,7 +6308,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6544,9 +6704,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -6639,9 +6800,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6659,8 +6821,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6695,8 +6858,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6827,6 +6991,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6922,6 +7087,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -6942,6 +7108,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6978,6 +7145,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7111,10 +7279,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7206,10 +7376,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7226,9 +7398,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7262,9 +7436,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7395,10 +7571,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7490,10 +7668,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7510,9 +7690,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7546,9 +7728,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7680,9 +7864,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7775,9 +7960,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7795,8 +7981,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7831,8 +8018,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7964,9 +8152,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8059,9 +8248,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8079,8 +8269,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8115,8 +8306,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8247,10 +8439,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8342,10 +8536,12 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8362,9 +8558,11 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8398,9 +8596,11 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8531,10 +8731,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8626,10 +8828,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8646,9 +8850,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8682,9 +8888,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8815,10 +9023,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8910,10 +9120,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8930,9 +9142,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8966,9 +9180,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9099,10 +9315,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9194,10 +9412,12 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9214,9 +9434,11 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9250,9 +9472,11 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9383,10 +9607,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9478,10 +9704,12 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9498,9 +9726,11 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9534,9 +9764,11 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9667,10 +9899,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9762,10 +9996,12 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9782,9 +10018,11 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9818,9 +10056,11 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9951,10 +10191,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10046,10 +10288,12 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10066,9 +10310,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10102,9 +10348,11 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10235,10 +10483,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10330,10 +10580,12 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10350,9 +10602,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10386,9 +10640,11 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10869,9 +11125,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10919,9 +11176,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10934,8 +11192,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10960,8 +11219,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11050,10 +11310,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11100,10 +11362,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11115,9 +11379,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11141,9 +11407,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11531,6 +11799,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11572,6 +11841,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11584,6 +11854,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11606,6 +11877,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11680,6 +11952,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11721,6 +11994,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11733,6 +12007,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11755,6 +12030,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11979,6 +12255,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12020,6 +12297,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12032,6 +12310,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12054,6 +12333,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12127,6 +12407,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12168,6 +12449,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12180,6 +12462,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12202,6 +12485,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12276,7 +12560,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12317,7 +12603,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12329,7 +12617,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12351,7 +12641,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12425,7 +12717,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12466,7 +12760,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12478,7 +12774,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12500,7 +12798,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12575,9 +12875,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12628,9 +12929,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12644,8 +12946,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12672,8 +12975,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12768,10 +13072,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12821,10 +13127,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12837,9 +13145,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12865,9 +13175,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12962,10 +13274,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13015,10 +13329,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13031,9 +13347,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13059,9 +13377,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13409,6 +13729,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13492,6 +13813,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13508,6 +13830,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13538,6 +13861,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13646,6 +13970,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -13729,6 +14054,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13745,6 +14071,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13775,6 +14102,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13884,7 +14212,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13967,7 +14297,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13983,7 +14315,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -14013,7 +14347,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -14122,7 +14458,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14205,7 +14543,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14221,7 +14561,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14251,7 +14593,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14361,6 +14705,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14444,6 +14789,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14460,6 +14806,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14490,6 +14837,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14599,6 +14947,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14682,6 +15031,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14698,6 +15048,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14728,6 +15079,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14836,7 +15188,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -14919,7 +15273,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -14935,7 +15291,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -14965,7 +15323,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -15074,7 +15434,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15157,7 +15519,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15173,7 +15537,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15203,7 +15569,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15312,7 +15680,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15395,7 +15765,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15411,7 +15783,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15441,7 +15815,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15550,7 +15926,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15633,7 +16011,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15649,7 +16029,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15679,7 +16061,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15788,7 +16172,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15871,7 +16257,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15887,7 +16275,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15917,7 +16307,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -16026,7 +16418,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16109,7 +16503,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16125,7 +16521,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16155,7 +16553,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16264,7 +16664,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16347,7 +16749,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16363,7 +16767,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16393,7 +16799,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16502,7 +16910,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -16585,7 +16995,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -16601,7 +17013,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -16631,7 +17045,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -17025,9 +17441,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17120,9 +17537,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17140,8 +17558,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17176,8 +17595,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17308,6 +17728,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -17403,6 +17824,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -17423,6 +17845,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17459,6 +17882,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17592,10 +18016,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17687,10 +18113,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17707,9 +18135,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17743,9 +18173,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17876,10 +18308,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17971,10 +18405,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17991,9 +18427,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18027,9 +18465,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18161,9 +18601,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18256,9 +18697,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18276,8 +18718,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18312,8 +18755,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18445,9 +18889,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18540,9 +18985,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18560,8 +19006,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18596,8 +19043,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18728,10 +19176,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18823,10 +19273,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18843,9 +19295,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18879,9 +19333,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19012,10 +19468,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19107,10 +19565,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19127,9 +19587,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19163,9 +19625,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19296,10 +19760,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19391,10 +19857,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19411,9 +19879,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19447,9 +19917,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19580,10 +20052,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19675,10 +20149,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19695,9 +20171,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19731,9 +20209,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19864,10 +20344,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19959,10 +20441,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19979,9 +20463,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20015,9 +20501,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20148,10 +20636,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20243,10 +20733,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20263,9 +20755,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20299,9 +20793,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20432,10 +20928,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20527,10 +21025,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20547,9 +21047,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20583,9 +21085,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20716,10 +21220,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20811,10 +21317,12 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20831,9 +21339,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20867,9 +21377,11 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index a88e0e217fdb4..e9475cefffab4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -852,10 +852,9 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -905,10 +904,9 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -990,7 +988,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1035,7 +1033,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 7c637a20ab47b..d9177a0faf0e1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -388,9 +388,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -438,9 +439,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -453,8 +455,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -479,8 +482,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -569,10 +573,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -619,10 +625,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -634,9 +642,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -660,9 +670,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1050,6 +1062,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1091,6 +1104,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1103,6 +1117,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1125,6 +1140,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1199,6 +1215,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1240,6 +1257,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1252,6 +1270,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1274,6 +1293,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1498,6 +1518,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1539,6 +1560,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1551,6 +1573,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1573,6 +1596,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1646,6 +1670,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1687,6 +1712,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1699,6 +1725,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1721,6 +1748,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1795,7 +1823,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1836,7 +1866,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1848,7 +1880,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1870,7 +1904,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1944,7 +1980,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -1985,7 +2023,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -1997,7 +2037,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -2019,7 +2061,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -2094,9 +2138,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2147,9 +2192,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2163,8 +2209,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2191,8 +2238,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2287,10 +2335,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2340,10 +2390,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2356,9 +2408,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2384,9 +2438,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2481,10 +2537,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2534,10 +2592,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2550,9 +2610,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2578,9 +2640,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2928,6 +2992,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3011,6 +3076,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3027,6 +3093,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3057,6 +3124,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3165,6 +3233,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3248,6 +3317,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3264,6 +3334,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3294,6 +3365,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3403,7 +3475,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3486,7 +3560,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3502,7 +3578,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3532,7 +3610,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3641,7 +3721,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3724,7 +3806,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3740,7 +3824,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3770,7 +3856,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3880,6 +3968,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -3963,6 +4052,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -3979,6 +4069,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -4009,6 +4100,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -4118,6 +4210,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4201,6 +4294,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4217,6 +4311,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4247,6 +4342,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4355,7 +4451,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4438,7 +4536,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4454,7 +4554,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4484,7 +4586,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4593,7 +4697,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4676,7 +4782,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4692,7 +4800,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4722,7 +4832,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4831,7 +4943,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -4914,7 +5028,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -4930,7 +5046,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -4960,7 +5078,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -5069,7 +5189,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5152,7 +5274,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5168,7 +5292,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5198,7 +5324,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5307,7 +5435,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5390,7 +5520,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5406,7 +5538,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5436,7 +5570,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5545,7 +5681,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5628,7 +5766,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5644,7 +5784,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5674,7 +5816,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5783,7 +5927,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5866,7 +6012,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5882,7 +6030,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5912,7 +6062,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -6021,7 +6173,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6104,7 +6258,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6120,7 +6276,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6150,7 +6308,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6544,9 +6704,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -6639,9 +6800,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6659,8 +6821,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6695,8 +6858,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6827,6 +6991,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6922,6 +7087,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -6942,6 +7108,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6978,6 +7145,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7111,10 +7279,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7206,10 +7376,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7226,9 +7398,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7262,9 +7436,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7395,10 +7571,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7490,10 +7668,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7510,9 +7690,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7546,9 +7728,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7680,9 +7864,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7775,9 +7960,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7795,8 +7981,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7831,8 +8018,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7964,9 +8152,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8059,9 +8248,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8079,8 +8269,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8115,8 +8306,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8247,10 +8439,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8342,10 +8536,12 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8362,9 +8558,11 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8398,9 +8596,11 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8531,10 +8731,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8626,10 +8828,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8646,9 +8850,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8682,9 +8888,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8815,10 +9023,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8910,10 +9120,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8930,9 +9142,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8966,9 +9180,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9099,10 +9315,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9194,10 +9412,12 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9214,9 +9434,11 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9250,9 +9472,11 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9383,10 +9607,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9478,10 +9704,12 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9498,9 +9726,11 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9534,9 +9764,11 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9667,10 +9899,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9762,10 +9996,12 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9782,9 +10018,11 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9818,9 +10056,11 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9951,10 +10191,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10046,10 +10288,12 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10066,9 +10310,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10102,9 +10348,11 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10235,10 +10483,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10330,10 +10580,12 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10350,9 +10602,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10386,9 +10640,11 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10869,9 +11125,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10919,9 +11176,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10934,8 +11192,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10960,8 +11219,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11050,10 +11310,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11100,10 +11362,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11115,9 +11379,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11141,9 +11407,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11531,6 +11799,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11572,6 +11841,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11584,6 +11854,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11606,6 +11877,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11680,6 +11952,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11721,6 +11994,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11733,6 +12007,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11755,6 +12030,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11979,6 +12255,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12020,6 +12297,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12032,6 +12310,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12054,6 +12333,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12127,6 +12407,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12168,6 +12449,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12180,6 +12462,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12202,6 +12485,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12276,7 +12560,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12317,7 +12603,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12329,7 +12617,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12351,7 +12641,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12425,7 +12717,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12466,7 +12760,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12478,7 +12774,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12500,7 +12798,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12575,9 +12875,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12628,9 +12929,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12644,8 +12946,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12672,8 +12975,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12768,10 +13072,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12821,10 +13127,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12837,9 +13145,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12865,9 +13175,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12962,10 +13274,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13015,10 +13329,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13031,9 +13347,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13059,9 +13377,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13409,6 +13729,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13492,6 +13813,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13508,6 +13830,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13538,6 +13861,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13646,6 +13970,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -13729,6 +14054,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13745,6 +14071,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13775,6 +14102,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13884,7 +14212,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13967,7 +14297,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13983,7 +14315,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -14013,7 +14347,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -14122,7 +14458,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14205,7 +14543,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14221,7 +14561,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14251,7 +14593,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14361,6 +14705,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14444,6 +14789,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14460,6 +14806,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14490,6 +14837,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14599,6 +14947,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14682,6 +15031,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14698,6 +15048,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14728,6 +15079,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14836,7 +15188,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -14919,7 +15273,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -14935,7 +15291,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -14965,7 +15323,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -15074,7 +15434,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15157,7 +15519,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15173,7 +15537,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15203,7 +15569,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15312,7 +15680,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15395,7 +15765,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15411,7 +15783,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15441,7 +15815,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15550,7 +15926,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15633,7 +16011,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15649,7 +16029,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15679,7 +16061,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15788,7 +16172,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15871,7 +16257,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15887,7 +16275,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15917,7 +16307,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -16026,7 +16418,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16109,7 +16503,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16125,7 +16521,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16155,7 +16553,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16264,7 +16664,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16347,7 +16749,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16363,7 +16767,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16393,7 +16799,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16502,7 +16910,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -16585,7 +16995,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -16601,7 +17013,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -16631,7 +17045,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -17025,9 +17441,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17120,9 +17537,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17140,8 +17558,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17176,8 +17595,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17308,10 +17728,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17403,10 +17825,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17423,9 +17847,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17459,9 +17885,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17592,10 +18020,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17687,10 +18117,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17707,9 +18139,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17743,9 +18177,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17877,9 +18313,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17972,9 +18409,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17992,8 +18430,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18028,8 +18467,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18161,9 +18601,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18256,9 +18697,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18276,8 +18718,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18312,8 +18755,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18444,10 +18888,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18539,10 +18985,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18559,9 +19007,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18595,9 +19045,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18728,10 +19180,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18823,10 +19277,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18843,9 +19299,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18879,9 +19337,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19012,10 +19472,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19107,10 +19569,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19127,9 +19591,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19163,9 +19629,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19296,10 +19764,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19391,10 +19861,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19411,9 +19883,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19447,9 +19921,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19580,10 +20056,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19675,10 +20153,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19695,9 +20175,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19731,9 +20213,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19864,10 +20348,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19959,10 +20445,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19979,9 +20467,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20015,9 +20505,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20148,10 +20640,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20243,10 +20737,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20263,9 +20759,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20299,9 +20797,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20432,10 +20932,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20527,10 +21029,12 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20547,9 +21051,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20583,9 +21089,11 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 0fd4aa4a7a93f..d9729611a5fc1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -388,10 +388,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -441,10 +440,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -457,9 +455,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -485,9 +482,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -581,12 +577,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -638,12 +633,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -655,11 +649,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -685,11 +678,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1093,7 +1085,7 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1138,7 +1130,7 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1151,7 +1143,7 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1175,7 +1167,7 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1259,7 +1251,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1304,7 +1296,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1317,7 +1309,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1341,7 +1333,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1575,7 +1567,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw: @@ -1621,7 +1613,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: @@ -1634,7 +1626,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: @@ -1659,7 +1651,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: @@ -1742,7 +1734,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1787,7 +1779,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1800,7 +1792,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1824,7 +1816,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1908,9 +1900,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: @@ -1958,9 +1950,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: @@ -1972,9 +1964,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: @@ -1999,9 +1991,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: @@ -2093,9 +2085,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: @@ -2143,9 +2135,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: @@ -2157,9 +2149,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: @@ -2184,9 +2176,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: @@ -2279,10 +2271,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2335,10 +2326,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2352,9 +2342,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2382,9 +2371,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2484,12 +2472,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2544,12 +2531,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2562,11 +2548,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2594,11 +2579,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2709,12 +2693,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2769,12 +2752,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2787,11 +2769,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2819,11 +2800,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3187,7 +3167,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: @@ -3275,7 +3255,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: @@ -3292,7 +3272,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: @@ -3325,7 +3305,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: @@ -3443,7 +3423,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3530,7 +3510,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3547,7 +3527,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3579,7 +3559,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3698,9 +3678,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: @@ -3790,9 +3770,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: @@ -3808,9 +3788,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: @@ -3843,9 +3823,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: @@ -3972,9 +3952,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: @@ -4064,9 +4044,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: @@ -4082,9 +4062,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: @@ -4117,9 +4097,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: @@ -4247,7 +4227,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: @@ -4335,7 +4315,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: @@ -4352,7 +4332,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: @@ -4385,7 +4365,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: @@ -4504,7 +4484,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: @@ -4592,7 +4572,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: @@ -4609,7 +4589,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: @@ -4642,7 +4622,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: @@ -4760,9 +4740,9 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: @@ -4852,9 +4832,9 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: @@ -4870,9 +4850,9 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: @@ -4905,9 +4885,9 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: @@ -5034,9 +5014,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: @@ -5126,9 +5106,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: @@ -5144,9 +5124,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: @@ -5179,9 +5159,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: @@ -5308,9 +5288,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: @@ -5400,9 +5380,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: @@ -5418,9 +5398,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: @@ -5453,9 +5433,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: @@ -5582,9 +5562,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5674,9 +5654,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5692,9 +5672,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5727,9 +5707,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6141,10 +6121,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -6239,10 +6218,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6260,9 +6238,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6298,9 +6275,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6436,7 +6412,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6535,7 +6511,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -6556,7 +6532,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6594,7 +6570,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6737,12 +6713,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -6839,12 +6814,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6861,11 +6835,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6901,11 +6874,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7052,12 +7024,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7154,12 +7125,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7176,11 +7146,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7216,11 +7185,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7368,10 +7336,9 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7466,10 +7433,9 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7487,9 +7453,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7525,9 +7490,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7666,10 +7630,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -7764,10 +7727,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7785,9 +7747,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7823,9 +7784,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7961,12 +7921,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8063,12 +8022,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8085,11 +8043,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8125,11 +8082,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8276,12 +8232,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8378,12 +8333,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8400,11 +8354,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8440,11 +8393,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8591,12 +8543,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -8693,12 +8644,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8715,11 +8665,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8755,11 +8704,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -8906,12 +8854,11 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9008,12 +8955,11 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9030,11 +8976,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9070,11 +9015,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9221,12 +9165,11 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9323,12 +9266,11 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9345,11 +9287,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9385,11 +9326,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9534,12 +9474,11 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9636,12 +9575,11 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9658,11 +9596,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9698,11 +9635,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9849,12 +9785,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -9951,12 +9886,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9973,11 +9907,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10013,11 +9946,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10164,12 +10096,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10266,12 +10197,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10288,11 +10218,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10328,11 +10257,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10829,9 +10757,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10881,9 +10810,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10896,8 +10826,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10923,8 +10854,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11018,10 +10950,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11072,10 +11006,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11087,9 +11023,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11115,9 +11053,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11519,6 +11459,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11562,6 +11503,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11574,6 +11516,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11597,6 +11540,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11678,6 +11622,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11721,6 +11666,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11733,6 +11679,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11756,6 +11703,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11987,6 +11935,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: @@ -12030,6 +11979,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: @@ -12042,6 +11992,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: @@ -12066,6 +12017,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: @@ -12145,6 +12097,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12188,6 +12141,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12200,6 +12154,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12223,6 +12178,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12304,7 +12260,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12349,7 +12307,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12361,7 +12321,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12386,7 +12348,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12473,7 +12437,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12518,7 +12484,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12530,7 +12498,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12555,7 +12525,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12643,9 +12615,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12698,9 +12671,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12714,8 +12688,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12743,8 +12718,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12844,10 +12820,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12901,10 +12879,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12917,9 +12897,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12947,9 +12929,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13058,10 +13042,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -13115,10 +13101,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13131,9 +13119,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13161,9 +13151,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13525,6 +13517,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13610,6 +13603,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13626,6 +13620,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13658,6 +13653,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13772,6 +13768,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -13857,6 +13854,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13873,6 +13871,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13904,6 +13903,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14020,7 +14020,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14107,7 +14109,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14123,7 +14127,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14156,7 +14162,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14278,7 +14286,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14365,7 +14375,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14381,7 +14393,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14414,7 +14428,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14537,6 +14553,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14622,6 +14639,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14638,6 +14656,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14670,6 +14689,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14785,6 +14805,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14870,6 +14891,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14886,6 +14908,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14918,6 +14941,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: @@ -15032,7 +15056,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15119,7 +15145,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15135,7 +15163,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15168,7 +15198,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15290,7 +15322,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15377,7 +15411,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15393,7 +15429,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15426,7 +15464,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15548,7 +15588,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15635,7 +15677,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15651,7 +15695,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15684,7 +15730,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15806,7 +15854,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15893,7 +15943,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15909,7 +15961,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15942,7 +15996,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -16064,7 +16120,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16151,7 +16209,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16167,7 +16227,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16200,7 +16262,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16322,7 +16386,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16409,7 +16475,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16425,7 +16493,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16458,7 +16528,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16580,7 +16652,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16667,7 +16741,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16683,7 +16759,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16716,7 +16794,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16838,7 +16918,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16925,7 +17007,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16941,7 +17025,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16974,7 +17060,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -17381,9 +17469,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -17478,9 +17567,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17498,8 +17588,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17535,8 +17626,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -17672,6 +17764,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -17769,6 +17862,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -17789,6 +17883,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17826,6 +17921,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17966,10 +18062,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18065,10 +18163,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18085,9 +18185,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18123,9 +18225,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18270,10 +18374,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18369,10 +18475,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18389,9 +18497,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18427,9 +18537,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18575,9 +18687,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18672,9 +18785,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18692,8 +18806,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18729,8 +18844,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -18869,9 +18985,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -18966,9 +19083,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18986,8 +19104,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19023,8 +19142,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19160,10 +19280,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19259,10 +19381,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19279,9 +19403,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19317,9 +19443,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19464,10 +19592,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19563,10 +19693,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19583,9 +19715,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19621,9 +19755,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19768,10 +19904,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -19867,10 +20005,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19887,9 +20027,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -19925,9 +20067,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20072,10 +20216,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20171,10 +20317,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20191,9 +20339,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20229,9 +20379,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20376,10 +20528,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20475,10 +20629,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20495,9 +20651,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20533,9 +20691,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20678,10 +20838,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -20777,10 +20939,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20797,9 +20961,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20835,9 +21001,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -20982,10 +21150,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -21081,10 +21251,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -21101,9 +21273,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -21139,9 +21313,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -21286,10 +21462,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -21385,10 +21563,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -21405,9 +21585,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -21443,9 +21625,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 8042d38716107..61ccefd35ec16 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -586,6 +586,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX6-NEXT: s_mov_b32 s5, s14 ; GFX6-NEXT: s_mov_b32 s6, s13 ; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -601,6 +602,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -650,6 +652,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -660,7 +663,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -682,7 +685,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -1077,6 +1080,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1091,6 +1095,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1129,6 +1134,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1139,6 +1145,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1159,6 +1166,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1233,6 +1241,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1247,6 +1256,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1285,6 +1295,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1295,6 +1306,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1315,6 +1327,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1696,6 +1709,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1710,6 +1724,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1747,6 +1762,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1757,6 +1773,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1777,6 +1794,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1850,6 +1868,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1864,6 +1883,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1901,6 +1921,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1911,6 +1932,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1931,6 +1953,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2004,6 +2027,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -2018,6 +2042,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2055,6 +2080,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2065,6 +2091,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2085,6 +2112,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2341,6 +2369,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2357,6 +2386,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2402,6 +2432,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2414,6 +2445,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2438,6 +2470,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -2524,6 +2557,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2540,6 +2574,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2585,6 +2620,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2597,6 +2633,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2621,6 +2658,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -3150,6 +3188,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3178,6 +3217,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3228,6 +3268,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3242,6 +3283,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3270,6 +3312,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3369,6 +3412,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3397,6 +3441,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3447,6 +3492,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3461,6 +3507,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3489,6 +3536,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3588,6 +3636,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3616,6 +3665,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3666,6 +3716,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3680,6 +3731,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3708,6 +3760,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4245,6 +4298,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4273,6 +4327,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4323,6 +4378,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4337,6 +4393,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4365,6 +4422,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4464,6 +4522,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4492,6 +4551,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4542,6 +4602,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4556,6 +4617,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4584,6 +4646,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4683,6 +4746,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4711,6 +4775,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4761,6 +4826,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4775,6 +4841,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4803,6 +4870,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4902,6 +4970,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4930,6 +4999,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4980,6 +5050,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4994,6 +5065,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5022,6 +5094,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5121,6 +5194,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5149,6 +5223,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5199,6 +5274,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5213,6 +5289,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5241,6 +5318,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5340,6 +5418,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5368,6 +5447,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5418,6 +5498,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5432,6 +5513,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5460,6 +5542,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5559,6 +5642,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5587,6 +5671,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5637,6 +5722,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5651,6 +5737,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5679,6 +5766,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5778,6 +5866,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5806,6 +5895,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5856,6 +5946,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5870,6 +5961,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5898,6 +5990,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6499,6 +6592,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6530,6 +6624,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6588,6 +6683,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -6605,6 +6701,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -6637,6 +6734,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -6750,6 +6848,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6781,6 +6880,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6839,6 +6939,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -6856,6 +6957,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -6888,6 +6990,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7001,6 +7104,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7032,6 +7136,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -7090,6 +7195,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7107,6 +7213,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7139,6 +7246,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7754,6 +7862,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7785,6 +7894,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -7843,6 +7953,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7860,6 +7971,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7892,6 +8004,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8005,6 +8118,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8036,6 +8150,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8094,6 +8209,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8111,6 +8227,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8143,6 +8260,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8256,6 +8374,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8287,6 +8406,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8345,6 +8465,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8362,6 +8483,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8394,6 +8516,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8507,6 +8630,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8538,6 +8662,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8596,6 +8721,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8613,6 +8739,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8645,6 +8772,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8758,6 +8886,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8789,6 +8918,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8847,6 +8977,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8864,6 +8995,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8896,6 +9028,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9009,6 +9142,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9040,6 +9174,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9098,6 +9233,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9115,6 +9251,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9147,6 +9284,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9260,6 +9398,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9291,6 +9430,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9349,6 +9489,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9366,6 +9507,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9398,6 +9540,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9511,6 +9654,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9542,6 +9686,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9600,6 +9745,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9617,6 +9763,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9649,6 +9796,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -10315,6 +10463,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 s5, s14 ; GFX6-NEXT: s_mov_b32 s6, s13 ; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -10330,6 +10479,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -10379,6 +10529,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -10389,7 +10540,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -10411,7 +10562,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -10806,6 +10957,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -10820,6 +10972,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10858,6 +11011,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10868,6 +11022,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10888,6 +11043,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10962,6 +11118,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -10976,6 +11133,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11014,6 +11172,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11024,6 +11183,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11044,6 +11204,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11425,6 +11586,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -11439,6 +11601,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11476,6 +11639,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11486,6 +11650,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11506,6 +11671,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11579,6 +11745,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -11593,6 +11760,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11630,6 +11798,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11640,6 +11809,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11660,6 +11830,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11733,6 +11904,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -11747,6 +11919,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11784,6 +11957,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11794,6 +11968,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11814,6 +11989,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12070,6 +12246,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -12086,6 +12263,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12131,6 +12309,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -12143,6 +12322,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -12167,6 +12347,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -12253,6 +12434,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -12269,6 +12451,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12314,6 +12497,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -12326,6 +12510,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -12350,6 +12535,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -12879,6 +13065,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -12907,6 +13094,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -12957,6 +13145,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12971,6 +13160,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12999,6 +13189,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13098,6 +13289,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -13126,6 +13318,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -13176,6 +13369,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13190,6 +13384,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13218,6 +13413,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13317,6 +13513,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -13345,6 +13542,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -13395,6 +13593,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13409,6 +13608,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13437,6 +13637,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13974,6 +14175,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -14002,6 +14204,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -14052,6 +14255,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -14066,6 +14270,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14094,6 +14299,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14193,6 +14399,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -14221,6 +14428,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -14271,6 +14479,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -14285,6 +14494,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14313,6 +14523,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14412,6 +14623,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -14440,6 +14652,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -14490,6 +14703,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -14504,6 +14718,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14532,6 +14747,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14631,6 +14847,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -14659,6 +14876,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -14709,6 +14927,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -14723,6 +14942,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14751,6 +14971,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14850,6 +15071,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -14878,6 +15100,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -14928,6 +15151,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -14942,6 +15166,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14970,6 +15195,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15069,6 +15295,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -15097,6 +15324,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -15147,6 +15375,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -15161,6 +15390,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15189,6 +15419,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15288,6 +15519,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -15316,6 +15548,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -15366,6 +15599,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -15380,6 +15614,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15408,6 +15643,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15507,6 +15743,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -15535,6 +15772,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -15585,6 +15823,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -15599,6 +15838,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15627,6 +15867,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -16228,6 +16469,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -16259,6 +16501,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -16317,6 +16560,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -16334,6 +16578,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16366,6 +16611,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16479,6 +16725,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -16510,6 +16757,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -16568,6 +16816,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -16585,6 +16834,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16617,6 +16867,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16730,6 +16981,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -16761,6 +17013,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -16819,6 +17072,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -16836,6 +17090,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16868,6 +17123,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17483,6 +17739,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -17514,6 +17771,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -17572,6 +17830,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -17589,6 +17848,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17621,6 +17881,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17734,6 +17995,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -17765,6 +18027,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -17823,6 +18086,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -17840,6 +18104,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17872,6 +18137,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17985,6 +18251,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -18016,6 +18283,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -18074,6 +18342,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -18091,6 +18360,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18123,6 +18393,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18236,6 +18507,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -18267,6 +18539,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -18325,6 +18598,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -18342,6 +18616,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18374,6 +18649,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18487,6 +18763,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -18518,6 +18795,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -18576,6 +18854,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -18593,6 +18872,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18625,6 +18905,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18738,6 +19019,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -18769,6 +19051,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -18827,6 +19110,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -18844,6 +19128,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18876,6 +19161,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18989,6 +19275,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -19020,6 +19307,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -19078,6 +19366,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -19095,6 +19384,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19127,6 +19417,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19240,6 +19531,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -19271,6 +19563,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -19329,6 +19622,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -19346,6 +19640,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19378,6 +19673,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 8a5c5dda9f79c..27c2cc6b2d63f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -861,7 +861,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX6-NEXT: s_mov_b32 s2, s6 ; GFX6-NEXT: s_mov_b32 s3, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -876,7 +876,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -918,7 +918,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index 151ba07a0b531..4d76032f0255d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -586,6 +586,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX6-NEXT: s_mov_b32 s5, s14 ; GFX6-NEXT: s_mov_b32 s6, s13 ; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -601,6 +602,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -650,6 +652,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -660,7 +663,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -682,7 +685,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -1077,6 +1080,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1091,6 +1095,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1129,6 +1134,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1139,6 +1145,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1159,6 +1166,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1233,6 +1241,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1247,6 +1256,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1285,6 +1295,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1295,6 +1306,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1315,6 +1327,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1696,6 +1709,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1710,6 +1724,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1747,6 +1762,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1757,6 +1773,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1777,6 +1794,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1850,6 +1868,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1864,6 +1883,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1901,6 +1921,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1911,6 +1932,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1931,6 +1953,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2004,6 +2027,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -2018,6 +2042,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2055,6 +2080,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2065,6 +2091,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2085,6 +2112,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2341,6 +2369,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2357,6 +2386,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2402,6 +2432,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2414,6 +2445,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2438,6 +2470,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -2524,6 +2557,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2540,6 +2574,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2585,6 +2620,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2597,6 +2633,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2621,6 +2658,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -3150,6 +3188,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3178,6 +3217,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3228,6 +3268,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3242,6 +3283,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3270,6 +3312,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3369,6 +3412,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3397,6 +3441,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3447,6 +3492,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3461,6 +3507,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3489,6 +3536,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3588,6 +3636,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3616,6 +3665,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3666,6 +3716,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3680,6 +3731,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3708,6 +3760,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4245,6 +4298,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4273,6 +4327,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4323,6 +4378,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4337,6 +4393,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4365,6 +4422,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4464,6 +4522,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4492,6 +4551,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4542,6 +4602,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4556,6 +4617,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4584,6 +4646,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4683,6 +4746,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4711,6 +4775,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4761,6 +4826,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4775,6 +4841,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4803,6 +4870,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4902,6 +4970,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4930,6 +4999,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4980,6 +5050,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4994,6 +5065,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5022,6 +5094,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5121,6 +5194,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5149,6 +5223,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5199,6 +5274,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5213,6 +5289,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5241,6 +5318,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5340,6 +5418,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5368,6 +5447,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5418,6 +5498,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5432,6 +5513,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5460,6 +5542,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5559,6 +5642,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5587,6 +5671,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5637,6 +5722,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5651,6 +5737,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5679,6 +5766,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5778,6 +5866,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5806,6 +5895,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5856,6 +5946,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5870,6 +5961,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5898,6 +5990,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6499,6 +6592,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6530,6 +6624,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6588,6 +6683,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -6605,6 +6701,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -6637,6 +6734,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -6750,6 +6848,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6781,6 +6880,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6839,6 +6939,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -6856,6 +6957,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -6888,6 +6990,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7001,6 +7104,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7032,6 +7136,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -7090,6 +7195,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7107,6 +7213,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7139,6 +7246,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7754,6 +7862,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7785,6 +7894,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -7843,6 +7953,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7860,6 +7971,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7892,6 +8004,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8005,6 +8118,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8036,6 +8150,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8094,6 +8209,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8111,6 +8227,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8143,6 +8260,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8256,6 +8374,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8287,6 +8406,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8345,6 +8465,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8362,6 +8483,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8394,6 +8516,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8507,6 +8630,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8538,6 +8662,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8596,6 +8721,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8613,6 +8739,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8645,6 +8772,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8758,6 +8886,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8789,6 +8918,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8847,6 +8977,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8864,6 +8995,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8896,6 +9028,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9009,6 +9142,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9040,6 +9174,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9098,6 +9233,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9115,6 +9251,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9147,6 +9284,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9260,6 +9398,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9291,6 +9430,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9349,6 +9489,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9366,6 +9507,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9398,6 +9540,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9511,6 +9654,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9542,6 +9686,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9600,6 +9745,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9617,6 +9763,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9649,6 +9796,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 69b0c7f93ab0e..d64474a92847b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -591,7 +591,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX6-NEXT: s_mov_b32 s5, s14 ; GFX6-NEXT: s_mov_b32 s6, s13 ; GFX6-NEXT: s_mov_b32 s7, s12 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -607,7 +607,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -659,7 +659,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -670,7 +670,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -693,7 +693,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -1099,7 +1099,7 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1114,7 +1114,7 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1156,7 +1156,7 @@ define amdgpu_kernel void @global_workgroup_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1167,7 +1167,7 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1189,7 +1189,7 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1273,7 +1273,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1288,7 +1288,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1330,7 +1330,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1341,7 +1341,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1363,7 +1363,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1764,7 +1764,7 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1779,7 +1779,7 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1820,7 +1820,7 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1831,7 +1831,7 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1853,7 +1853,7 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1936,7 +1936,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1951,7 +1951,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2005,7 +2005,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2029,7 +2029,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2118,7 +2118,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -2133,7 +2133,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2176,7 +2176,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2187,7 +2187,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2211,7 +2211,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2488,7 +2488,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2505,7 +2505,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2555,7 +2555,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2568,7 +2568,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2595,7 +2595,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -2696,7 +2696,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2713,7 +2713,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2763,7 +2763,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2776,7 +2776,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2803,7 +2803,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -3357,7 +3357,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3386,7 +3386,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3440,7 +3440,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3455,7 +3455,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3485,7 +3485,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3594,7 +3594,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3623,7 +3623,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3679,7 +3679,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3694,7 +3694,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3726,7 +3726,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3841,7 +3841,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3870,7 +3870,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3926,7 +3926,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3941,7 +3941,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3973,7 +3973,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4546,7 +4546,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4575,7 +4575,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4631,7 +4631,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4646,7 +4646,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4678,7 +4678,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4793,7 +4793,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -4822,7 +4822,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -4878,7 +4878,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -4893,7 +4893,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4925,7 +4925,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5040,7 +5040,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5069,7 +5069,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5125,7 +5125,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5140,7 +5140,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5172,7 +5172,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5287,7 +5287,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5316,7 +5316,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5372,7 +5372,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5387,7 +5387,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5419,7 +5419,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5534,7 +5534,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5563,7 +5563,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5619,7 +5619,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5634,7 +5634,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5666,7 +5666,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5781,7 +5781,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -5810,7 +5810,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -5866,7 +5866,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -5881,7 +5881,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5913,7 +5913,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6028,7 +6028,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6057,7 +6057,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -6113,7 +6113,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6128,7 +6128,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6160,7 +6160,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6275,7 +6275,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -6304,7 +6304,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -6360,7 +6360,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6375,7 +6375,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6407,7 +6407,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -7029,7 +7029,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7061,7 +7061,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -7123,7 +7123,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7141,7 +7141,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7175,7 +7175,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7298,7 +7298,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7330,7 +7330,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -7393,7 +7393,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7411,7 +7411,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7446,7 +7446,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7574,7 +7574,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7606,7 +7606,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -7669,7 +7669,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7687,7 +7687,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7722,7 +7722,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8364,7 +8364,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8396,7 +8396,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8459,7 +8459,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8477,7 +8477,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8512,7 +8512,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8640,7 +8640,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8672,7 +8672,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -8735,7 +8735,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8753,7 +8753,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8788,7 +8788,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8916,7 +8916,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8948,7 +8948,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9011,7 +9011,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9029,7 +9029,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9064,7 +9064,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9192,7 +9192,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9224,7 +9224,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9287,7 +9287,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9305,7 +9305,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9340,7 +9340,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9468,7 +9468,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9500,7 +9500,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9563,7 +9563,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9581,7 +9581,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9616,7 +9616,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9742,7 +9742,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9774,7 +9774,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -9837,7 +9837,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -9855,7 +9855,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9890,7 +9890,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -10018,7 +10018,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10050,7 +10050,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -10113,7 +10113,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -10131,7 +10131,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -10166,7 +10166,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -10294,7 +10294,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10326,7 +10326,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -10389,7 +10389,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -10407,7 +10407,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -10442,7 +10442,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 0467c5047a0be..24a859869bc08 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -366,7 +366,7 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -380,7 +380,7 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -419,7 +419,7 @@ define amdgpu_kernel void @local_agent_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -432,7 +432,7 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -457,7 +457,7 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -541,9 +541,9 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -556,9 +556,9 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -599,9 +599,9 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -613,9 +613,9 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -640,9 +640,9 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1015,7 +1015,7 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1027,7 +1027,7 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @local_agent_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1095,7 +1095,7 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1185,7 +1185,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1220,7 +1220,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1231,7 +1231,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1253,7 +1253,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acquire_atomicrmw: @@ -1484,7 +1484,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acquire_atomicrmw: @@ -1519,7 +1519,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_atomicrmw: @@ -1530,7 +1530,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_atomicrmw: @@ -1552,7 +1552,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acquire_atomicrmw: @@ -1627,7 +1627,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1639,7 +1639,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1674,7 +1674,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1707,7 +1707,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1785,9 +1785,9 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acq_rel_atomicrmw: @@ -1798,9 +1798,9 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acq_rel_atomicrmw: @@ -1837,9 +1837,9 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: @@ -1849,9 +1849,9 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: @@ -1873,9 +1873,9 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: @@ -1959,9 +1959,9 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_seq_cst_atomicrmw: @@ -1972,9 +1972,9 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_seq_cst_atomicrmw: @@ -2011,9 +2011,9 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: @@ -2023,9 +2023,9 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: @@ -2047,9 +2047,9 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: @@ -2134,7 +2134,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2149,7 +2149,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2191,7 +2191,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2205,7 +2205,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2232,7 +2232,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2321,9 +2321,9 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2337,9 +2337,9 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2383,9 +2383,9 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2398,9 +2398,9 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2427,9 +2427,9 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2527,9 +2527,9 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2543,9 +2543,9 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2589,9 +2589,9 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2604,9 +2604,9 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2633,9 +2633,9 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2903,7 +2903,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acquire_monotonic_cmpxchg: @@ -2917,7 +2917,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg: @@ -2958,7 +2958,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: @@ -2971,7 +2971,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: @@ -2997,7 +2997,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: @@ -3085,7 +3085,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3099,7 +3099,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -3140,7 +3140,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3153,7 +3153,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3179,7 +3179,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3270,9 +3270,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acq_rel_monotonic_cmpxchg: @@ -3285,9 +3285,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg: @@ -3330,9 +3330,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: @@ -3344,9 +3344,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: @@ -3372,9 +3372,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: @@ -3471,9 +3471,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_seq_cst_monotonic_cmpxchg: @@ -3486,9 +3486,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg: @@ -3531,9 +3531,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: @@ -3545,9 +3545,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: @@ -3573,9 +3573,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: @@ -3673,7 +3673,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_monotonic_acquire_cmpxchg: @@ -3687,7 +3687,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg: @@ -3728,7 +3728,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: @@ -3741,7 +3741,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: @@ -3767,7 +3767,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: @@ -3856,7 +3856,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acquire_acquire_cmpxchg: @@ -3870,7 +3870,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acquire_acquire_cmpxchg: @@ -3911,7 +3911,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: @@ -3924,7 +3924,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: @@ -3950,7 +3950,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: @@ -4038,9 +4038,9 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_release_acquire_cmpxchg: @@ -4053,9 +4053,9 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_release_acquire_cmpxchg: @@ -4098,9 +4098,9 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: @@ -4112,9 +4112,9 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: @@ -4140,9 +4140,9 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: @@ -4239,9 +4239,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acq_rel_acquire_cmpxchg: @@ -4254,9 +4254,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg: @@ -4299,9 +4299,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: @@ -4313,9 +4313,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: @@ -4341,9 +4341,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: @@ -4440,9 +4440,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_seq_cst_acquire_cmpxchg: @@ -4455,9 +4455,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg: @@ -4500,9 +4500,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: @@ -4514,9 +4514,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: @@ -4542,9 +4542,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: @@ -4641,9 +4641,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_monotonic_seq_cst_cmpxchg: @@ -4656,9 +4656,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg: @@ -4701,9 +4701,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: @@ -4715,9 +4715,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: @@ -4743,9 +4743,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: @@ -4842,9 +4842,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acquire_seq_cst_cmpxchg: @@ -4857,9 +4857,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg: @@ -4902,9 +4902,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: @@ -4916,9 +4916,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: @@ -4944,9 +4944,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: @@ -5043,9 +5043,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_release_seq_cst_cmpxchg: @@ -5058,9 +5058,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_release_seq_cst_cmpxchg: @@ -5103,9 +5103,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: @@ -5117,9 +5117,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: @@ -5145,9 +5145,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: @@ -5244,9 +5244,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: @@ -5259,9 +5259,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: @@ -5304,9 +5304,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: @@ -5318,9 +5318,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: @@ -5346,9 +5346,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: @@ -5445,9 +5445,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: @@ -5460,9 +5460,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: @@ -5505,9 +5505,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: @@ -5519,9 +5519,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: @@ -5547,9 +5547,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: @@ -5858,7 +5858,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -5875,7 +5875,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -5923,7 +5923,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -5939,7 +5939,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5970,7 +5970,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6073,7 +6073,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6091,7 +6091,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6142,7 +6142,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6159,7 +6159,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6191,7 +6191,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6302,9 +6302,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6320,9 +6320,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6372,9 +6372,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6389,9 +6389,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6422,9 +6422,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6536,9 +6536,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6554,9 +6554,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6606,9 +6606,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6623,9 +6623,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6656,9 +6656,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6771,7 +6771,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6788,7 +6788,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6836,7 +6836,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6852,7 +6852,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6883,7 +6883,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6987,7 +6987,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7004,7 +7004,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7052,7 +7052,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7068,7 +7068,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7099,7 +7099,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7202,9 +7202,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7220,9 +7220,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7272,9 +7272,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7289,9 +7289,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7322,9 +7322,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7436,9 +7436,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7454,9 +7454,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7506,9 +7506,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7523,9 +7523,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7556,9 +7556,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7670,9 +7670,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7688,9 +7688,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7740,9 +7740,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7757,9 +7757,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7790,9 +7790,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7904,9 +7904,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7922,9 +7922,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7974,9 +7974,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7991,9 +7991,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8024,9 +8024,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8138,9 +8138,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8156,9 +8156,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8208,9 +8208,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8225,9 +8225,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8258,9 +8258,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8372,9 +8372,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8390,9 +8390,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8442,9 +8442,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8459,9 +8459,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8492,9 +8492,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8606,9 +8606,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8624,9 +8624,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8676,9 +8676,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8693,9 +8693,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8726,9 +8726,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8840,9 +8840,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8858,9 +8858,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8910,9 +8910,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8927,9 +8927,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8960,9 +8960,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -9415,6 +9415,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9429,6 +9430,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9467,6 +9469,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9480,6 +9483,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9504,6 +9508,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9585,7 +9590,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9599,7 +9606,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9637,7 +9646,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9650,7 +9661,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9674,7 +9687,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10036,6 +10051,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10047,6 +10063,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10078,6 +10095,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10088,6 +10106,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10108,6 +10127,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10176,6 +10196,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10187,6 +10208,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10218,6 +10240,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10228,6 +10251,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10248,6 +10272,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10457,6 +10482,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10468,6 +10494,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10499,6 +10526,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10509,6 +10537,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10529,6 +10558,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10596,6 +10626,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10607,6 +10638,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10638,6 +10670,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10648,6 +10681,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10668,6 +10702,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10736,7 +10771,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10747,7 +10784,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10778,7 +10817,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10788,7 +10829,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10808,7 +10851,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10876,7 +10921,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10887,7 +10934,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10918,7 +10967,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10928,7 +10979,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10948,7 +11001,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -11017,6 +11072,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11032,6 +11088,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11073,6 +11130,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11087,6 +11145,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11113,6 +11172,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11199,7 +11259,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11214,7 +11276,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11255,7 +11319,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11269,7 +11335,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11295,7 +11363,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11382,7 +11452,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11397,7 +11469,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11438,7 +11512,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11452,7 +11528,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11478,7 +11556,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11735,6 +11815,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11748,6 +11829,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11785,6 +11867,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11797,6 +11880,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11821,6 +11905,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11901,6 +11986,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11914,6 +12000,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11951,6 +12038,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11963,6 +12051,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11987,6 +12076,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12068,7 +12158,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12081,7 +12173,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12118,7 +12212,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12130,7 +12226,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12154,7 +12252,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12235,7 +12335,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12248,7 +12350,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12285,7 +12389,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12297,7 +12403,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12321,7 +12429,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12403,6 +12513,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12416,6 +12527,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12453,6 +12565,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12465,6 +12578,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12489,6 +12603,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12570,6 +12685,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12583,6 +12699,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12620,6 +12737,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12632,6 +12750,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12656,6 +12775,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12736,7 +12856,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12749,7 +12871,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12786,7 +12910,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12798,7 +12924,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12822,7 +12950,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12903,7 +13033,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12916,7 +13048,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12953,7 +13087,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12965,7 +13101,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12989,7 +13127,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -13070,7 +13210,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13083,7 +13225,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13120,7 +13264,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13132,7 +13278,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13156,7 +13304,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13237,7 +13387,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13250,7 +13402,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13287,7 +13441,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13299,7 +13455,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13323,7 +13481,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13404,7 +13564,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13417,7 +13579,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13454,7 +13618,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13466,7 +13632,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13490,7 +13658,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13571,7 +13741,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13584,7 +13756,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13621,7 +13795,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13633,7 +13809,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13657,7 +13835,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13738,7 +13918,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13751,7 +13933,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13788,7 +13972,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13800,7 +13986,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13824,7 +14012,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13905,7 +14095,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13918,7 +14110,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13955,7 +14149,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13967,7 +14163,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13991,7 +14189,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -14284,6 +14484,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14301,6 +14502,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14348,6 +14550,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14364,6 +14567,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14394,6 +14598,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14494,6 +14699,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14511,6 +14717,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14558,6 +14765,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14574,6 +14782,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14604,6 +14813,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14705,7 +14915,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14722,7 +14934,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14769,7 +14983,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14785,7 +15001,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14815,7 +15033,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14916,7 +15136,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14933,7 +15155,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14980,7 +15204,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14996,7 +15222,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15026,7 +15254,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15128,6 +15358,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15145,6 +15376,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15192,6 +15424,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15208,6 +15441,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15238,6 +15472,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15339,6 +15574,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15356,6 +15592,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15403,6 +15640,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15419,6 +15657,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15449,6 +15688,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15549,7 +15789,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15566,7 +15808,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15613,7 +15857,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15629,7 +15875,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15659,7 +15907,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15760,7 +16010,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15777,7 +16029,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15824,7 +16078,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15840,7 +16096,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15870,7 +16128,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15971,7 +16231,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15988,7 +16250,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16035,7 +16299,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16051,7 +16317,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16081,7 +16349,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16182,7 +16452,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16199,7 +16471,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16246,7 +16520,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16262,7 +16538,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16292,7 +16570,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16393,7 +16673,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16410,7 +16692,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16457,7 +16741,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16473,7 +16759,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16503,7 +16791,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16604,7 +16894,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16621,7 +16913,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16668,7 +16962,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16684,7 +16980,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16714,7 +17012,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16815,7 +17115,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16832,7 +17134,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16879,7 +17183,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16895,7 +17201,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16925,7 +17233,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17026,7 +17336,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17043,7 +17355,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -17090,7 +17404,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17106,7 +17422,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17136,7 +17454,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 78209ee34cad4..48b3b26eae084 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -830,7 +830,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: ds_read_b32 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -845,6 +845,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -892,7 +893,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -904,7 +905,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -928,7 +929,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll index f84d451f8ecb0..b240f51241a04 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -366,6 +366,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -380,6 +381,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -418,6 +420,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -431,6 +434,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -455,6 +459,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -536,7 +541,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -550,7 +557,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -588,7 +597,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -601,7 +612,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -625,7 +638,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -987,6 +1002,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -998,6 +1014,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1029,6 +1046,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1039,6 +1057,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1059,6 +1078,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1127,6 +1147,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1138,6 +1159,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1169,6 +1191,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1179,6 +1202,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1199,6 +1223,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1408,6 +1433,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acquire_atomicrmw: @@ -1419,6 +1445,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acquire_atomicrmw: @@ -1450,6 +1477,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: @@ -1460,6 +1488,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: @@ -1480,6 +1509,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: @@ -1547,6 +1577,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1558,6 +1589,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1589,6 +1621,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1599,6 +1632,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1619,6 +1653,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1687,7 +1722,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1698,7 +1735,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1729,7 +1768,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1739,7 +1780,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1759,7 +1802,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1827,7 +1872,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1838,7 +1885,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1869,7 +1918,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1879,7 +1930,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1899,7 +1952,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1968,6 +2023,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1983,6 +2039,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2024,6 +2081,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2038,6 +2096,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2064,6 +2123,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2150,7 +2210,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2165,7 +2227,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2206,7 +2270,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2220,7 +2286,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2246,7 +2314,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2333,7 +2403,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2348,7 +2420,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2389,7 +2463,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2403,7 +2479,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2429,7 +2507,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2686,6 +2766,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2699,6 +2780,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2736,6 +2818,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2748,6 +2831,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2772,6 +2856,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2852,6 +2937,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2865,6 +2951,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -2902,6 +2989,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2914,6 +3002,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2938,6 +3027,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3019,7 +3109,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3032,7 +3124,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3069,7 +3163,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3081,7 +3177,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3105,7 +3203,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3186,7 +3286,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3199,7 +3301,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3236,7 +3340,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3248,7 +3354,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3272,7 +3380,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3354,6 +3464,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3367,6 +3478,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3404,6 +3516,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3416,6 +3529,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3440,6 +3554,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3521,6 +3636,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3534,6 +3650,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3571,6 +3688,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3583,6 +3701,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3607,6 +3726,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3687,7 +3807,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3700,7 +3822,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3737,7 +3861,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3749,7 +3875,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3773,7 +3901,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3854,7 +3984,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3867,7 +3999,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3904,7 +4038,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3916,7 +4052,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3940,7 +4078,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -4021,7 +4161,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4034,7 +4176,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4071,7 +4215,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4083,7 +4229,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4107,7 +4255,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4188,7 +4338,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4201,7 +4353,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4238,7 +4392,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4250,7 +4406,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4274,7 +4432,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4355,7 +4515,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4368,7 +4530,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4405,7 +4569,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4417,7 +4583,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4441,7 +4609,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4522,7 +4692,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4535,7 +4707,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4572,7 +4746,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4584,7 +4760,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4608,7 +4786,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4689,7 +4869,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4702,7 +4884,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4739,7 +4923,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4751,7 +4937,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4775,7 +4963,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4856,7 +5046,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4869,7 +5061,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4906,7 +5100,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4918,7 +5114,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4942,7 +5140,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -5235,6 +5435,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5252,6 +5453,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5299,6 +5501,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5315,6 +5518,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5345,6 +5549,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5445,6 +5650,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5462,6 +5668,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5509,6 +5716,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5525,6 +5733,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5555,6 +5764,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5656,7 +5866,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5673,7 +5885,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5720,7 +5934,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5736,7 +5952,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5766,7 +5984,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5867,7 +6087,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5884,7 +6106,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5931,7 +6155,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5947,7 +6173,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5977,7 +6205,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6079,6 +6309,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6096,6 +6327,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6143,6 +6375,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6159,6 +6392,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6189,6 +6423,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6290,6 +6525,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6307,6 +6543,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6354,6 +6591,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6370,6 +6608,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6400,6 +6639,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6500,7 +6740,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6517,7 +6759,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6564,7 +6808,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6580,7 +6826,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6610,7 +6858,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6711,7 +6961,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6728,7 +6980,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6775,7 +7029,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6791,7 +7047,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6821,7 +7079,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6922,7 +7182,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6939,7 +7201,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6986,7 +7250,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7002,7 +7268,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7032,7 +7300,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7133,7 +7403,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7150,7 +7422,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7197,7 +7471,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7213,7 +7489,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7243,7 +7521,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7344,7 +7624,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7361,7 +7643,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7408,7 +7692,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7424,7 +7710,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7454,7 +7742,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7555,7 +7845,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7572,7 +7864,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7619,7 +7913,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7635,7 +7931,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7665,7 +7963,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7766,7 +8066,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7783,7 +8085,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7830,7 +8134,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7846,7 +8152,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7876,7 +8184,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7977,7 +8287,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7994,7 +8306,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8041,7 +8355,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8057,7 +8373,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8087,7 +8405,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8529,6 +8849,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8543,6 +8864,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8581,6 +8903,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8594,6 +8917,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8618,6 +8942,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8699,7 +9024,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8713,7 +9040,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8751,7 +9080,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8764,7 +9095,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8788,7 +9121,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9150,6 +9485,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9161,6 +9497,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9192,6 +9529,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9202,6 +9540,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9222,6 +9561,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9290,6 +9630,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9301,6 +9642,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9332,6 +9674,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9342,6 +9685,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9362,6 +9706,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9571,6 +9916,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9582,6 +9928,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9613,6 +9960,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9623,6 +9971,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9643,6 +9992,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9710,6 +10060,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9721,6 +10072,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9752,6 +10104,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9762,6 +10115,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9782,6 +10136,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9850,7 +10205,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9861,7 +10218,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9892,7 +10251,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9902,7 +10263,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9922,7 +10285,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9990,7 +10355,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10001,7 +10368,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10032,7 +10401,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10042,7 +10413,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10062,7 +10435,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10131,6 +10506,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10146,6 +10522,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10187,6 +10564,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10201,6 +10579,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10227,6 +10606,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10313,7 +10693,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10328,7 +10710,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10369,7 +10753,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10383,7 +10769,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10409,7 +10797,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10496,7 +10886,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10511,7 +10903,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10552,7 +10946,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10566,7 +10962,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10592,7 +10990,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10849,6 +11249,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10862,6 +11263,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10899,6 +11301,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10911,6 +11314,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10935,6 +11339,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -11015,6 +11420,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11028,6 +11434,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11065,6 +11472,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11077,6 +11485,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11101,6 +11510,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11182,7 +11592,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11195,7 +11607,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11232,7 +11646,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11244,7 +11660,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11268,7 +11686,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11349,7 +11769,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11362,7 +11784,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11399,7 +11823,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11411,7 +11837,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11435,7 +11863,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11517,6 +11947,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11530,6 +11961,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11567,6 +11999,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11579,6 +12012,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11603,6 +12037,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11684,6 +12119,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11697,6 +12133,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11734,6 +12171,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11746,6 +12184,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11770,6 +12209,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11850,7 +12290,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11863,7 +12305,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11900,7 +12344,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11912,7 +12358,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11936,7 +12384,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -12017,7 +12467,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12030,7 +12482,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12067,7 +12521,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12079,7 +12535,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12103,7 +12561,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12184,7 +12644,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12197,7 +12659,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12234,7 +12698,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12246,7 +12712,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12270,7 +12738,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12351,7 +12821,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12364,7 +12836,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12401,7 +12875,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12413,7 +12889,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12437,7 +12915,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12518,7 +12998,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12531,7 +13013,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12568,7 +13052,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12580,7 +13066,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12604,7 +13092,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12685,7 +13175,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12698,7 +13190,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12735,7 +13229,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12747,7 +13243,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12771,7 +13269,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12852,7 +13352,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12865,7 +13367,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12902,7 +13406,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12914,7 +13420,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12938,7 +13446,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -13019,7 +13529,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13032,7 +13544,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13069,7 +13583,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13081,7 +13597,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13105,7 +13623,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13398,6 +13918,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13415,6 +13936,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13462,6 +13984,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13478,6 +14001,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13508,6 +14032,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13608,6 +14133,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -13625,6 +14151,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13672,6 +14199,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13688,6 +14216,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13718,6 +14247,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13819,7 +14349,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13836,7 +14368,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13883,7 +14417,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13899,7 +14435,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13929,7 +14467,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14030,7 +14570,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14047,7 +14589,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14094,7 +14638,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14110,7 +14656,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14140,7 +14688,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14242,6 +14792,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14259,6 +14810,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14306,6 +14858,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14322,6 +14875,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14352,6 +14906,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14453,6 +15008,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14470,6 +15026,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14517,6 +15074,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14533,6 +15091,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14563,6 +15122,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14663,7 +15223,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14680,7 +15242,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14727,7 +15291,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14743,7 +15309,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14773,7 +15341,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14874,7 +15444,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14891,7 +15463,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14938,7 +15512,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14954,7 +15530,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14984,7 +15562,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15085,7 +15665,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15102,7 +15684,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15149,7 +15733,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15165,7 +15751,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15195,7 +15783,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15296,7 +15886,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15313,7 +15905,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15360,7 +15954,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15376,7 +15972,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15406,7 +16004,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15507,7 +16107,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15524,7 +16126,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15571,7 +16175,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15587,7 +16193,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15617,7 +16225,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15718,7 +16328,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15735,7 +16347,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15782,7 +16396,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15798,7 +16414,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15828,7 +16446,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15929,7 +16549,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15946,7 +16568,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15993,7 +16617,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16009,7 +16635,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16039,7 +16667,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16140,7 +16770,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16157,7 +16789,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16204,7 +16838,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16220,7 +16856,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16250,7 +16888,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 74a297241d851..7b8493960a629 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -366,7 +366,7 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -380,7 +380,7 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -419,7 +419,7 @@ define amdgpu_kernel void @local_system_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -432,7 +432,7 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -457,7 +457,7 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -541,9 +541,9 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -556,9 +556,9 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -599,9 +599,9 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -613,9 +613,9 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -640,9 +640,9 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1015,7 +1015,7 @@ define amdgpu_kernel void @local_system_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1027,7 +1027,7 @@ define amdgpu_kernel void @local_system_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @local_system_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @local_system_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1095,7 +1095,7 @@ define amdgpu_kernel void @local_system_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1185,7 +1185,7 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1220,7 +1220,7 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1231,7 +1231,7 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1253,7 +1253,7 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acquire_atomicrmw: @@ -1484,7 +1484,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acquire_atomicrmw: @@ -1519,7 +1519,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_atomicrmw: @@ -1530,7 +1530,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_atomicrmw: @@ -1552,7 +1552,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acquire_atomicrmw: @@ -1627,7 +1627,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1639,7 +1639,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1674,7 +1674,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1707,7 +1707,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1785,9 +1785,9 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acq_rel_atomicrmw: @@ -1798,9 +1798,9 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acq_rel_atomicrmw: @@ -1837,9 +1837,9 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_atomicrmw: @@ -1849,9 +1849,9 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_atomicrmw: @@ -1873,9 +1873,9 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_atomicrmw: @@ -1959,9 +1959,9 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_seq_cst_atomicrmw: @@ -1972,9 +1972,9 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_seq_cst_atomicrmw: @@ -2011,9 +2011,9 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_atomicrmw: @@ -2023,9 +2023,9 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_atomicrmw: @@ -2047,9 +2047,9 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_atomicrmw: @@ -2134,7 +2134,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2149,7 +2149,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2191,7 +2191,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2205,7 +2205,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2232,7 +2232,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2321,9 +2321,9 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2337,9 +2337,9 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2383,9 +2383,9 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2398,9 +2398,9 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2427,9 +2427,9 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2527,9 +2527,9 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2543,9 +2543,9 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2589,9 +2589,9 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2604,9 +2604,9 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2633,9 +2633,9 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2903,7 +2903,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acquire_monotonic_cmpxchg: @@ -2917,7 +2917,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acquire_monotonic_cmpxchg: @@ -2958,7 +2958,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: @@ -2971,7 +2971,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: @@ -2997,7 +2997,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: @@ -3085,7 +3085,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3099,7 +3099,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -3140,7 +3140,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3153,7 +3153,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3179,7 +3179,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3270,9 +3270,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acq_rel_monotonic_cmpxchg: @@ -3285,9 +3285,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg: @@ -3330,9 +3330,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: @@ -3344,9 +3344,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: @@ -3372,9 +3372,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: @@ -3471,9 +3471,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_seq_cst_monotonic_cmpxchg: @@ -3486,9 +3486,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg: @@ -3531,9 +3531,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: @@ -3545,9 +3545,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: @@ -3573,9 +3573,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: @@ -3673,7 +3673,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_monotonic_acquire_cmpxchg: @@ -3687,7 +3687,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_monotonic_acquire_cmpxchg: @@ -3728,7 +3728,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: @@ -3741,7 +3741,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: @@ -3767,7 +3767,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: @@ -3856,7 +3856,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acquire_acquire_cmpxchg: @@ -3870,7 +3870,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acquire_acquire_cmpxchg: @@ -3911,7 +3911,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: @@ -3924,7 +3924,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: @@ -3950,7 +3950,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: @@ -4038,9 +4038,9 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_release_acquire_cmpxchg: @@ -4053,9 +4053,9 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_release_acquire_cmpxchg: @@ -4098,9 +4098,9 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_acquire_cmpxchg: @@ -4112,9 +4112,9 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_release_acquire_cmpxchg: @@ -4140,9 +4140,9 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_release_acquire_cmpxchg: @@ -4239,9 +4239,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acq_rel_acquire_cmpxchg: @@ -4254,9 +4254,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg: @@ -4299,9 +4299,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: @@ -4313,9 +4313,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: @@ -4341,9 +4341,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: @@ -4440,9 +4440,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_seq_cst_acquire_cmpxchg: @@ -4455,9 +4455,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg: @@ -4500,9 +4500,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: @@ -4514,9 +4514,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: @@ -4542,9 +4542,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: @@ -4641,9 +4641,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_monotonic_seq_cst_cmpxchg: @@ -4656,9 +4656,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg: @@ -4701,9 +4701,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: @@ -4715,9 +4715,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: @@ -4743,9 +4743,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: @@ -4842,9 +4842,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acquire_seq_cst_cmpxchg: @@ -4857,9 +4857,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg: @@ -4902,9 +4902,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: @@ -4916,9 +4916,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: @@ -4944,9 +4944,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: @@ -5043,9 +5043,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_release_seq_cst_cmpxchg: @@ -5058,9 +5058,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_release_seq_cst_cmpxchg: @@ -5103,9 +5103,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: @@ -5117,9 +5117,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: @@ -5145,9 +5145,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: @@ -5244,9 +5244,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acq_rel_seq_cst_cmpxchg: @@ -5259,9 +5259,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg: @@ -5304,9 +5304,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: @@ -5318,9 +5318,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: @@ -5346,9 +5346,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: @@ -5445,9 +5445,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_seq_cst_seq_cst_cmpxchg: @@ -5460,9 +5460,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg: @@ -5505,9 +5505,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: @@ -5519,9 +5519,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: @@ -5547,9 +5547,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: @@ -5858,7 +5858,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -5875,7 +5875,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -5923,7 +5923,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -5939,7 +5939,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5970,7 +5970,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6073,7 +6073,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6091,7 +6091,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6142,7 +6142,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6159,7 +6159,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6191,7 +6191,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6302,9 +6302,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6320,9 +6320,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6372,9 +6372,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6389,9 +6389,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6422,9 +6422,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6536,9 +6536,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6554,9 +6554,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6606,9 +6606,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6623,9 +6623,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6656,9 +6656,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6771,7 +6771,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6788,7 +6788,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6836,7 +6836,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6852,7 +6852,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6883,7 +6883,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6987,7 +6987,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7004,7 +7004,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7052,7 +7052,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7068,7 +7068,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7099,7 +7099,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7202,9 +7202,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7220,9 +7220,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7272,9 +7272,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7289,9 +7289,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7322,9 +7322,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7436,9 +7436,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7454,9 +7454,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7506,9 +7506,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7523,9 +7523,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7556,9 +7556,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7670,9 +7670,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7688,9 +7688,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7740,9 +7740,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7757,9 +7757,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7790,9 +7790,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7904,9 +7904,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7922,9 +7922,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7974,9 +7974,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7991,9 +7991,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8024,9 +8024,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8138,9 +8138,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8156,9 +8156,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8208,9 +8208,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8225,9 +8225,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8258,9 +8258,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8372,9 +8372,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8390,9 +8390,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8442,9 +8442,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8459,9 +8459,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8492,9 +8492,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8606,9 +8606,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8624,9 +8624,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8676,9 +8676,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8693,9 +8693,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8726,9 +8726,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8840,9 +8840,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8858,9 +8858,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8910,9 +8910,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8927,9 +8927,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8960,9 +8960,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -9415,6 +9415,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9429,6 +9430,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9467,6 +9469,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9480,6 +9483,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9504,6 +9508,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9585,7 +9590,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9599,7 +9606,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9637,7 +9646,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9650,7 +9661,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9674,7 +9687,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10036,6 +10051,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10047,6 +10063,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10078,6 +10095,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10088,6 +10106,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10108,6 +10127,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10176,6 +10196,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10187,6 +10208,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10218,6 +10240,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10228,6 +10251,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10248,6 +10272,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10457,6 +10482,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10468,6 +10494,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10499,6 +10526,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10509,6 +10537,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10529,6 +10558,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10596,6 +10626,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10607,6 +10638,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10638,6 +10670,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10648,6 +10681,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10668,6 +10702,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10736,7 +10771,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10747,7 +10784,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10778,7 +10817,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10788,7 +10829,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10808,7 +10851,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10876,7 +10921,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10887,7 +10934,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10918,7 +10967,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10928,7 +10979,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10948,7 +11001,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -11017,6 +11072,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11032,6 +11088,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11073,6 +11130,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11087,6 +11145,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11113,6 +11172,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11199,7 +11259,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11214,7 +11276,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11255,7 +11319,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11269,7 +11335,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11295,7 +11363,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11382,7 +11452,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11397,7 +11469,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11438,7 +11512,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11452,7 +11528,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11478,7 +11556,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11735,6 +11815,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11748,6 +11829,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11785,6 +11867,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11797,6 +11880,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11821,6 +11905,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11901,6 +11986,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11914,6 +12000,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11951,6 +12038,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11963,6 +12051,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11987,6 +12076,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12068,7 +12158,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12081,7 +12173,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12118,7 +12212,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12130,7 +12226,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12154,7 +12252,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12235,7 +12335,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12248,7 +12350,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12285,7 +12389,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12297,7 +12403,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12321,7 +12429,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12403,6 +12513,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12416,6 +12527,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12453,6 +12565,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12465,6 +12578,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12489,6 +12603,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12570,6 +12685,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12583,6 +12699,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12620,6 +12737,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12632,6 +12750,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12656,6 +12775,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12736,7 +12856,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12749,7 +12871,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12786,7 +12910,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12798,7 +12924,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12822,7 +12950,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12903,7 +13033,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12916,7 +13048,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12953,7 +13087,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12965,7 +13101,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12989,7 +13127,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -13070,7 +13210,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13083,7 +13225,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13120,7 +13264,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13132,7 +13278,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13156,7 +13304,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13237,7 +13387,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13250,7 +13402,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13287,7 +13441,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13299,7 +13455,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13323,7 +13481,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13404,7 +13564,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13417,7 +13579,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13454,7 +13618,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13466,7 +13632,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13490,7 +13658,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13571,7 +13741,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13584,7 +13756,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13621,7 +13795,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13633,7 +13809,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13657,7 +13835,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13738,7 +13918,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13751,7 +13933,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13788,7 +13972,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13800,7 +13986,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13824,7 +14012,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13905,7 +14095,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13918,7 +14110,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13955,7 +14149,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13967,7 +14163,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13991,7 +14189,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -14284,6 +14484,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14301,6 +14502,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14348,6 +14550,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14364,6 +14567,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14394,6 +14598,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14494,6 +14699,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14511,6 +14717,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14558,6 +14765,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14574,6 +14782,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14604,6 +14813,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14705,7 +14915,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14722,7 +14934,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14769,7 +14983,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14785,7 +15001,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14815,7 +15033,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14916,7 +15136,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14933,7 +15155,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14980,7 +15204,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14996,7 +15222,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15026,7 +15254,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15128,6 +15358,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15145,6 +15376,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15192,6 +15424,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15208,6 +15441,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15238,6 +15472,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15339,6 +15574,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15356,6 +15592,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15403,6 +15640,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15419,6 +15657,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15449,6 +15688,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15549,7 +15789,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15566,7 +15808,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15613,7 +15857,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15629,7 +15875,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15659,7 +15907,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15760,7 +16010,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15777,7 +16029,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15824,7 +16078,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15840,7 +16096,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15870,7 +16128,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15971,7 +16231,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15988,7 +16250,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16035,7 +16299,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16051,7 +16317,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16081,7 +16349,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16182,7 +16452,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16199,7 +16471,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16246,7 +16520,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16262,7 +16538,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16292,7 +16570,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16393,7 +16673,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16410,7 +16692,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16457,7 +16741,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16473,7 +16759,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16503,7 +16791,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16604,7 +16894,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16621,7 +16913,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16668,7 +16962,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16684,7 +16980,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16714,7 +17012,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16815,7 +17115,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16832,7 +17134,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16879,7 +17183,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16895,7 +17201,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16925,7 +17233,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17026,7 +17336,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17043,7 +17355,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -17090,7 +17404,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17106,7 +17422,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17136,7 +17454,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index bc2508411ed6b..590dedc85d9c4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -28,7 +28,7 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_read_b32 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -43,6 +43,7 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -90,7 +91,7 @@ define amdgpu_kernel void @local_volatile_load_0( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -169,7 +170,7 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], s4, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -186,6 +187,7 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -239,7 +241,7 @@ define amdgpu_kernel void @local_volatile_load_1( ; SKIP-CACHE-INV-NEXT: v_add_i32_e64 v0, s[4:5], s4, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -330,6 +332,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_volatile_store_0: @@ -343,6 +346,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_volatile_store_0: @@ -380,6 +384,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_volatile_store_0: @@ -461,6 +466,7 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_volatile_store_1: @@ -476,6 +482,7 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_volatile_store_1: @@ -517,6 +524,7 @@ define amdgpu_kernel void @local_volatile_store_1( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_volatile_store_1: @@ -610,7 +618,7 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s1 ; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -624,7 +632,7 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -663,7 +671,7 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -735,7 +743,7 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s1 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -747,7 +755,7 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -782,7 +790,7 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll index b24622a48a16b..148f3ed0ec152 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -366,6 +366,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -380,6 +381,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -418,6 +420,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -431,6 +434,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -455,6 +459,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -536,7 +541,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -550,7 +557,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -588,7 +597,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -601,7 +612,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -625,7 +638,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -987,6 +1002,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -998,6 +1014,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1029,6 +1046,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1039,6 +1057,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1059,6 +1078,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1127,6 +1147,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1138,6 +1159,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1169,6 +1191,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1179,6 +1202,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1199,6 +1223,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1408,6 +1433,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acquire_atomicrmw: @@ -1419,6 +1445,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acquire_atomicrmw: @@ -1450,6 +1477,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: @@ -1460,6 +1488,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: @@ -1480,6 +1509,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: @@ -1547,6 +1577,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1558,6 +1589,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1589,6 +1621,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1599,6 +1632,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1619,6 +1653,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1687,7 +1722,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1698,7 +1735,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1729,7 +1768,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1739,7 +1780,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1759,7 +1802,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1827,7 +1872,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1838,7 +1885,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1869,7 +1918,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1879,7 +1930,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1899,7 +1952,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1968,6 +2023,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1983,6 +2039,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2024,6 +2081,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2038,6 +2096,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2064,6 +2123,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2150,7 +2210,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2165,7 +2227,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2206,7 +2270,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2220,7 +2286,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2246,7 +2314,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2333,7 +2403,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2348,7 +2420,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2389,7 +2463,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2403,7 +2479,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2429,7 +2507,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2686,6 +2766,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2699,6 +2780,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2736,6 +2818,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2748,6 +2831,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2772,6 +2856,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2852,6 +2937,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2865,6 +2951,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -2902,6 +2989,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2914,6 +3002,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2938,6 +3027,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3019,7 +3109,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3032,7 +3124,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3069,7 +3163,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3081,7 +3177,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3105,7 +3203,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3186,7 +3286,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3199,7 +3301,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3236,7 +3340,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3248,7 +3354,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3272,7 +3380,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3354,6 +3464,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3367,6 +3478,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3404,6 +3516,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3416,6 +3529,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3440,6 +3554,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3521,6 +3636,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3534,6 +3650,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3571,6 +3688,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3583,6 +3701,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3607,6 +3726,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3687,7 +3807,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3700,7 +3822,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3737,7 +3861,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3749,7 +3875,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3773,7 +3901,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3854,7 +3984,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3867,7 +3999,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3904,7 +4038,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3916,7 +4052,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3940,7 +4078,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -4021,7 +4161,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4034,7 +4176,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4071,7 +4215,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4083,7 +4229,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4107,7 +4255,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4188,7 +4338,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4201,7 +4353,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4238,7 +4392,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4250,7 +4406,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4274,7 +4432,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4355,7 +4515,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4368,7 +4530,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4405,7 +4569,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4417,7 +4583,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4441,7 +4609,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4522,7 +4692,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4535,7 +4707,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4572,7 +4746,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4584,7 +4760,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4608,7 +4786,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4689,7 +4869,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4702,7 +4884,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4739,7 +4923,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4751,7 +4937,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4775,7 +4963,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4856,7 +5046,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4869,7 +5061,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4906,7 +5100,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4918,7 +5114,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4942,7 +5140,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -5235,6 +5435,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5252,6 +5453,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5299,6 +5501,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5315,6 +5518,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5345,6 +5549,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5445,6 +5650,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5462,6 +5668,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5509,6 +5716,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5525,6 +5733,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5555,6 +5764,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5656,7 +5866,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5673,7 +5885,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5720,7 +5934,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5736,7 +5952,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5766,7 +5984,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5867,7 +6087,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5884,7 +6106,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5931,7 +6155,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5947,7 +6173,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5977,7 +6205,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6079,6 +6309,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6096,6 +6327,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6143,6 +6375,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6159,6 +6392,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6189,6 +6423,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6290,6 +6525,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6307,6 +6543,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6354,6 +6591,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6370,6 +6608,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6400,6 +6639,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6500,7 +6740,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6517,7 +6759,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6564,7 +6808,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6580,7 +6826,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6610,7 +6858,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6711,7 +6961,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6728,7 +6980,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6775,7 +7029,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6791,7 +7047,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6821,7 +7079,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6922,7 +7182,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6939,7 +7201,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6986,7 +7250,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7002,7 +7268,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7032,7 +7300,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7133,7 +7403,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7150,7 +7422,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7197,7 +7471,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7213,7 +7489,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7243,7 +7521,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7344,7 +7624,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7361,7 +7643,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7408,7 +7692,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7424,7 +7710,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7454,7 +7742,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7555,7 +7845,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7572,7 +7864,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7619,7 +7913,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7635,7 +7931,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7665,7 +7963,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7766,7 +8066,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7783,7 +8085,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7830,7 +8134,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7846,7 +8152,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7876,7 +8184,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7977,7 +8287,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7994,7 +8306,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8041,7 +8355,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8057,7 +8373,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8087,7 +8405,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8529,6 +8849,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8543,6 +8864,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8581,6 +8903,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8594,6 +8917,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8618,6 +8942,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8699,7 +9024,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8713,7 +9040,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8751,7 +9080,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8764,7 +9095,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8788,7 +9121,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9150,6 +9485,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9161,6 +9497,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9192,6 +9529,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9202,6 +9540,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9222,6 +9561,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9290,6 +9630,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9301,6 +9642,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9332,6 +9674,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9342,6 +9685,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9362,6 +9706,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9571,6 +9916,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9582,6 +9928,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9613,6 +9960,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9623,6 +9971,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9643,6 +9992,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9710,6 +10060,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9721,6 +10072,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9752,6 +10104,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9762,6 +10115,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9782,6 +10136,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9850,7 +10205,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9861,7 +10218,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9892,7 +10251,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9902,7 +10263,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9922,7 +10285,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9990,7 +10355,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10001,7 +10368,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10032,7 +10401,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10042,7 +10413,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10062,7 +10435,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10131,6 +10506,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10146,6 +10522,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10187,6 +10564,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10201,6 +10579,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10227,6 +10606,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10313,7 +10693,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10328,7 +10710,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10369,7 +10753,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10383,7 +10769,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10409,7 +10797,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10496,7 +10886,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10511,7 +10903,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10552,7 +10946,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10566,7 +10962,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10592,7 +10990,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10849,6 +11249,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10862,6 +11263,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10899,6 +11301,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10911,6 +11314,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10935,6 +11339,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -11015,6 +11420,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11028,6 +11434,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11065,6 +11472,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11077,6 +11485,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11101,6 +11510,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11182,7 +11592,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11195,7 +11607,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11232,7 +11646,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11244,7 +11660,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11268,7 +11686,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11349,7 +11769,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11362,7 +11784,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11399,7 +11823,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11411,7 +11837,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11435,7 +11863,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11517,6 +11947,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11530,6 +11961,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11567,6 +11999,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11579,6 +12012,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11603,6 +12037,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11684,6 +12119,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11697,6 +12133,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11734,6 +12171,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11746,6 +12184,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11770,6 +12209,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11850,7 +12290,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11863,7 +12305,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11900,7 +12344,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11912,7 +12358,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11936,7 +12384,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -12017,7 +12467,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12030,7 +12482,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12067,7 +12521,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12079,7 +12535,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12103,7 +12561,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12184,7 +12644,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12197,7 +12659,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12234,7 +12698,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12246,7 +12712,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12270,7 +12738,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12351,7 +12821,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12364,7 +12836,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12401,7 +12875,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12413,7 +12889,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12437,7 +12915,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12518,7 +12998,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12531,7 +13013,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12568,7 +13052,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12580,7 +13066,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12604,7 +13092,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12685,7 +13175,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12698,7 +13190,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12735,7 +13229,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12747,7 +13243,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12771,7 +13269,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12852,7 +13352,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12865,7 +13367,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12902,7 +13406,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12914,7 +13420,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12938,7 +13446,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -13019,7 +13529,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13032,7 +13544,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13069,7 +13583,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13081,7 +13597,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13105,7 +13623,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13398,6 +13918,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13415,6 +13936,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13462,6 +13984,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13478,6 +14001,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13508,6 +14032,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13608,6 +14133,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -13625,6 +14151,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13672,6 +14199,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13688,6 +14216,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13718,6 +14247,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13819,7 +14349,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13836,7 +14368,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13883,7 +14417,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13899,7 +14435,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13929,7 +14467,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14030,7 +14570,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14047,7 +14589,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14094,7 +14638,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14110,7 +14656,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14140,7 +14688,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14242,6 +14792,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14259,6 +14810,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14306,6 +14858,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14322,6 +14875,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14352,6 +14906,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14453,6 +15008,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14470,6 +15026,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14517,6 +15074,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14533,6 +15091,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14563,6 +15122,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14663,7 +15223,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14680,7 +15242,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14727,7 +15291,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14743,7 +15309,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14773,7 +15341,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14874,7 +15444,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14891,7 +15463,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14938,7 +15512,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14954,7 +15530,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14984,7 +15562,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15085,7 +15665,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15102,7 +15684,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15149,7 +15733,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15165,7 +15751,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15195,7 +15783,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15296,7 +15886,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15313,7 +15905,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15360,7 +15954,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15376,7 +15972,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15406,7 +16004,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15507,7 +16107,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15524,7 +16126,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15571,7 +16175,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15587,7 +16193,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15617,7 +16225,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15718,7 +16328,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15735,7 +16347,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15782,7 +16396,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15798,7 +16414,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15828,7 +16446,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15929,7 +16549,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15946,7 +16568,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15993,7 +16617,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16009,7 +16635,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16039,7 +16667,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16140,7 +16770,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16157,7 +16789,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16204,7 +16838,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16220,7 +16856,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16250,7 +16888,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 62d7f4801baf8..538995a09e1c2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -366,7 +366,7 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -380,7 +380,7 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -419,7 +419,7 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -432,7 +432,7 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -457,7 +457,7 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -541,9 +541,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -556,9 +556,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -599,9 +599,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -613,9 +613,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -640,9 +640,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -1015,7 +1015,7 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1027,7 +1027,7 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @local_workgroup_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1095,7 +1095,7 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1185,7 +1185,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1220,7 +1220,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1231,7 +1231,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1253,7 +1253,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acquire_atomicrmw: @@ -1484,7 +1484,7 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acquire_atomicrmw: @@ -1519,7 +1519,7 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: @@ -1530,7 +1530,7 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: @@ -1552,7 +1552,7 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: @@ -1627,7 +1627,7 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1639,7 +1639,7 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1674,7 +1674,7 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1707,7 +1707,7 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1785,9 +1785,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acq_rel_atomicrmw: @@ -1798,9 +1798,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_atomicrmw: @@ -1837,9 +1837,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: @@ -1849,9 +1849,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: @@ -1873,9 +1873,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: @@ -1959,9 +1959,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_seq_cst_atomicrmw: @@ -1972,9 +1972,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_atomicrmw: @@ -2011,9 +2011,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: @@ -2023,9 +2023,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: @@ -2047,9 +2047,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: @@ -2134,7 +2134,7 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2149,7 +2149,7 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2191,7 +2191,7 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2205,7 +2205,7 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2232,7 +2232,7 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2321,9 +2321,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2337,9 +2337,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2383,9 +2383,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2398,9 +2398,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2427,9 +2427,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2527,9 +2527,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -2543,9 +2543,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -2589,9 +2589,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -2604,9 +2604,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2633,9 +2633,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -2903,7 +2903,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acquire_monotonic_cmpxchg: @@ -2917,7 +2917,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg: @@ -2958,7 +2958,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: @@ -2971,7 +2971,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: @@ -2997,7 +2997,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: @@ -3085,7 +3085,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3099,7 +3099,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -3140,7 +3140,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3153,7 +3153,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3179,7 +3179,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3270,9 +3270,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: @@ -3285,9 +3285,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: @@ -3330,9 +3330,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: @@ -3344,9 +3344,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: @@ -3372,9 +3372,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: @@ -3471,9 +3471,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: @@ -3486,9 +3486,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: @@ -3531,9 +3531,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: @@ -3545,9 +3545,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: @@ -3573,9 +3573,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: @@ -3673,7 +3673,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_monotonic_acquire_cmpxchg: @@ -3687,7 +3687,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg: @@ -3728,7 +3728,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: @@ -3741,7 +3741,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: @@ -3767,7 +3767,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: @@ -3856,7 +3856,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acquire_acquire_cmpxchg: @@ -3870,7 +3870,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg: @@ -3911,7 +3911,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: @@ -3924,7 +3924,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: @@ -3950,7 +3950,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: @@ -4038,9 +4038,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_release_acquire_cmpxchg: @@ -4053,9 +4053,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_release_acquire_cmpxchg: @@ -4098,9 +4098,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: @@ -4112,9 +4112,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: @@ -4140,9 +4140,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: @@ -4239,9 +4239,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: @@ -4254,9 +4254,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: @@ -4299,9 +4299,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: @@ -4313,9 +4313,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: @@ -4341,9 +4341,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: @@ -4440,9 +4440,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: @@ -4455,9 +4455,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: @@ -4500,9 +4500,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: @@ -4514,9 +4514,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: @@ -4542,9 +4542,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: @@ -4641,9 +4641,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: @@ -4656,9 +4656,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: @@ -4701,9 +4701,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: @@ -4715,9 +4715,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: @@ -4743,9 +4743,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: @@ -4842,9 +4842,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: @@ -4857,9 +4857,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: @@ -4902,9 +4902,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: @@ -4916,9 +4916,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: @@ -4944,9 +4944,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: @@ -5043,9 +5043,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_release_seq_cst_cmpxchg: @@ -5058,9 +5058,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg: @@ -5103,9 +5103,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: @@ -5117,9 +5117,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: @@ -5145,9 +5145,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: @@ -5244,9 +5244,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: @@ -5259,9 +5259,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: @@ -5304,9 +5304,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: @@ -5318,9 +5318,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: @@ -5346,9 +5346,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: @@ -5445,9 +5445,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5460,9 +5460,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5505,9 +5505,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5519,9 +5519,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5547,9 +5547,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: @@ -5858,7 +5858,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -5875,7 +5875,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -5923,7 +5923,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -5939,7 +5939,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5970,7 +5970,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6073,7 +6073,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6091,7 +6091,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6142,7 +6142,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6159,7 +6159,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6191,7 +6191,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6302,9 +6302,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6320,9 +6320,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6372,9 +6372,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6389,9 +6389,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6422,9 +6422,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6536,9 +6536,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6554,9 +6554,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6606,9 +6606,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6623,9 +6623,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6656,9 +6656,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6771,7 +6771,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -6788,7 +6788,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -6836,7 +6836,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -6852,7 +6852,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6883,7 +6883,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -6987,7 +6987,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7004,7 +7004,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7052,7 +7052,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7068,7 +7068,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7099,7 +7099,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7202,9 +7202,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7220,9 +7220,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7272,9 +7272,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7289,9 +7289,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7322,9 +7322,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7436,9 +7436,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7454,9 +7454,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7506,9 +7506,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7523,9 +7523,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7556,9 +7556,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7670,9 +7670,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7688,9 +7688,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7740,9 +7740,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7757,9 +7757,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7790,9 +7790,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -7904,9 +7904,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -7922,9 +7922,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -7974,9 +7974,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -7991,9 +7991,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8024,9 +8024,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8138,9 +8138,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8156,9 +8156,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8208,9 +8208,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8225,9 +8225,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8258,9 +8258,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8372,9 +8372,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8390,9 +8390,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8442,9 +8442,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8459,9 +8459,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8492,9 +8492,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8606,9 +8606,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8624,9 +8624,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8676,9 +8676,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8693,9 +8693,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8726,9 +8726,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -8840,9 +8840,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: ds_write_b32 v0, v1 @@ -8858,9 +8858,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 @@ -8910,9 +8910,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 @@ -8927,9 +8927,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -8960,9 +8960,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm @@ -9415,6 +9415,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9429,6 +9430,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9467,6 +9469,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9480,6 +9483,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9504,6 +9508,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9585,7 +9590,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9599,7 +9606,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9637,7 +9646,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9650,7 +9661,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9674,7 +9687,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10036,6 +10051,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10047,6 +10063,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10078,6 +10095,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10088,6 +10106,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10108,6 +10127,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10176,6 +10196,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10187,6 +10208,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10218,6 +10240,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10228,6 +10251,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10248,6 +10272,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10457,6 +10482,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10468,6 +10494,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10499,6 +10526,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10509,6 +10537,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10529,6 +10558,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10596,6 +10626,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10607,6 +10638,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10638,6 +10670,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10648,6 +10681,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10668,6 +10702,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10736,7 +10771,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10747,7 +10784,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10778,7 +10817,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10788,7 +10829,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10808,7 +10851,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10876,7 +10921,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10887,7 +10934,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10918,7 +10967,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10928,7 +10979,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10948,7 +11001,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -11017,6 +11072,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11032,6 +11088,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11073,6 +11130,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11087,6 +11145,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11113,6 +11172,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11199,7 +11259,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11214,7 +11276,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11255,7 +11319,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11269,7 +11335,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11295,7 +11363,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11382,7 +11452,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11397,7 +11469,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11438,7 +11512,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11452,7 +11528,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11478,7 +11556,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11735,6 +11815,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11748,6 +11829,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11785,6 +11867,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11797,6 +11880,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11821,6 +11905,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11901,6 +11986,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11914,6 +12000,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11951,6 +12038,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11963,6 +12051,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11987,6 +12076,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12068,7 +12158,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12081,7 +12173,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12118,7 +12212,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12130,7 +12226,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12154,7 +12252,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12235,7 +12335,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12248,7 +12350,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12285,7 +12389,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12297,7 +12403,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12321,7 +12429,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12403,6 +12513,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12416,6 +12527,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12453,6 +12565,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12465,6 +12578,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12489,6 +12603,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12570,6 +12685,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12583,6 +12699,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12620,6 +12737,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12632,6 +12750,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12656,6 +12775,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12736,7 +12856,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12749,7 +12871,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12786,7 +12910,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12798,7 +12924,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12822,7 +12950,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12903,7 +13033,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12916,7 +13048,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12953,7 +13087,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12965,7 +13101,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12989,7 +13127,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -13070,7 +13210,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13083,7 +13225,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13120,7 +13264,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13132,7 +13278,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13156,7 +13304,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13237,7 +13387,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13250,7 +13402,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13287,7 +13441,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13299,7 +13455,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13323,7 +13481,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13404,7 +13564,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13417,7 +13579,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13454,7 +13618,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13466,7 +13632,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13490,7 +13658,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13571,7 +13741,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13584,7 +13756,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13621,7 +13795,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13633,7 +13809,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13657,7 +13835,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13738,7 +13918,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13751,7 +13933,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13788,7 +13972,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13800,7 +13986,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13824,7 +14012,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13905,7 +14095,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13918,7 +14110,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13955,7 +14149,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13967,7 +14163,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13991,7 +14189,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -14284,6 +14484,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14301,6 +14502,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14348,6 +14550,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14364,6 +14567,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14394,6 +14598,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14494,6 +14699,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14511,6 +14717,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14558,6 +14765,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14574,6 +14782,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14604,6 +14813,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14705,7 +14915,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14722,7 +14934,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14769,7 +14983,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14785,7 +15001,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14815,7 +15033,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14916,7 +15136,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14933,7 +15155,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14980,7 +15204,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14996,7 +15222,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15026,7 +15254,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15128,6 +15358,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15145,6 +15376,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15192,6 +15424,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15208,6 +15441,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15238,6 +15472,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15339,6 +15574,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15356,6 +15592,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15403,6 +15640,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15419,6 +15657,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15449,6 +15688,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15549,7 +15789,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15566,7 +15808,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15613,7 +15857,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15629,7 +15875,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15659,7 +15907,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15760,7 +16010,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15777,7 +16029,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15824,7 +16078,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15840,7 +16096,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15870,7 +16128,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15971,7 +16231,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15988,7 +16250,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16035,7 +16299,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16051,7 +16317,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16081,7 +16349,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16182,7 +16452,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16199,7 +16471,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16246,7 +16520,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16262,7 +16538,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16292,7 +16570,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16393,7 +16673,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16410,7 +16692,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16457,7 +16741,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16473,7 +16759,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16503,7 +16791,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16604,7 +16894,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16621,7 +16913,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16668,7 +16962,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16684,7 +16980,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16714,7 +17012,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16815,7 +17115,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16832,7 +17134,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16879,7 +17183,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16895,7 +17201,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16925,7 +17233,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17026,7 +17336,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17043,7 +17355,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -17090,7 +17404,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17106,7 +17422,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17136,7 +17454,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir index adaee7ebaddd3..7ab8a51aef68d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir @@ -1,18 +1,23 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck -check-prefix=GCN %s --- -# GCN-LABEL: name: load_singlethread_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_unordered body: | bb.0: + ; GCN-LABEL: name: load_singlethread_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -26,17 +31,21 @@ body: | ... --- -# GCN-LABEL: name: load_singlethread_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_monotonic body: | bb.0: + ; GCN-LABEL: name: load_singlethread_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -50,17 +59,22 @@ body: | ... --- -# GCN-LABEL: name: load_singlethread_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_acquire body: | bb.0: + ; GCN-LABEL: name: load_singlethread_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -74,17 +88,23 @@ body: | ... --- -# GCN-LABEL: name: load_singlethread_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_singlethread_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -98,17 +118,21 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_unordered body: | bb.0: + ; GCN-LABEL: name: load_wavefront_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -122,17 +146,21 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_monotonic body: | bb.0: + ; GCN-LABEL: name: load_wavefront_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -146,17 +174,22 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_acquire body: | bb.0: + ; GCN-LABEL: name: load_wavefront_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -170,17 +203,23 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_wavefront_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -194,17 +233,21 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_unordered body: | bb.0: + ; GCN-LABEL: name: load_workgroup_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -218,17 +261,21 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_monotonic body: | bb.0: + ; GCN-LABEL: name: load_workgroup_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -242,17 +289,22 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_acquire body: | bb.0: + ; GCN-LABEL: name: load_workgroup_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -266,17 +318,23 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_workgroup_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -290,17 +348,21 @@ body: | ... --- -# GCN-LABEL: name: load_agent_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_unordered body: | bb.0: + ; GCN-LABEL: name: load_agent_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -314,17 +376,21 @@ body: | ... --- -# GCN-LABEL: name: load_agent_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_monotonic body: | bb.0: + ; GCN-LABEL: name: load_agent_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -338,17 +404,22 @@ body: | ... --- -# GCN-LABEL: name: load_agent_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_acquire body: | bb.0: + ; GCN-LABEL: name: load_agent_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -362,17 +433,23 @@ body: | ... --- -# GCN-LABEL: name: load_agent_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_agent_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -386,17 +463,21 @@ body: | ... --- -# GCN-LABEL: name: load_system_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_unordered body: | bb.0: + ; GCN-LABEL: name: load_system_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -410,17 +491,21 @@ body: | ... --- -# GCN-LABEL: name: load_system_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_monotonic body: | bb.0: + ; GCN-LABEL: name: load_system_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -434,17 +519,22 @@ body: | ... --- -# GCN-LABEL: name: load_system_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_acquire body: | bb.0: + ; GCN-LABEL: name: load_system_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -458,17 +548,23 @@ body: | ... --- -# GCN-LABEL: name: load_system_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_system_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -482,17 +578,19 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_unordered body: | bb.0: + ; GCN-LABEL: name: store_singlethread_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -504,17 +602,19 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_monotonic body: | bb.0: + ; GCN-LABEL: name: store_singlethread_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -526,17 +626,20 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_release body: | bb.0: + ; GCN-LABEL: name: store_singlethread_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -548,17 +651,20 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_singlethread_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -570,17 +676,19 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_unordered body: | bb.0: + ; GCN-LABEL: name: store_wavefront_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -592,17 +700,19 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_monotonic body: | bb.0: + ; GCN-LABEL: name: store_wavefront_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -614,17 +724,20 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_release body: | bb.0: + ; GCN-LABEL: name: store_wavefront_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -636,17 +749,20 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_wavefront_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -658,17 +774,19 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_unordered body: | bb.0: + ; GCN-LABEL: name: store_workgroup_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -680,17 +798,19 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_monotonic body: | bb.0: + ; GCN-LABEL: name: store_workgroup_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -702,17 +822,20 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_release body: | bb.0: + ; GCN-LABEL: name: store_workgroup_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -724,17 +847,20 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_workgroup_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -746,17 +872,19 @@ body: | ... --- -# GCN-LABEL: name: store_agent_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_unordered body: | bb.0: + ; GCN-LABEL: name: store_agent_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -768,17 +896,19 @@ body: | ... --- -# GCN-LABEL: name: store_agent_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_monotonic body: | bb.0: + ; GCN-LABEL: name: store_agent_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -790,17 +920,20 @@ body: | ... --- -# GCN-LABEL: name: store_agent_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_release body: | bb.0: + ; GCN-LABEL: name: store_agent_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -812,17 +945,20 @@ body: | ... --- -# GCN-LABEL: name: store_agent_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_agent_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -834,17 +970,19 @@ body: | ... --- -# GCN-LABEL: name: store_system_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_unordered body: | bb.0: + ; GCN-LABEL: name: store_system_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") unordered (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -856,17 +994,19 @@ body: | ... --- -# GCN-LABEL: name: store_system_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_monotonic body: | bb.0: + ; GCN-LABEL: name: store_system_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") monotonic (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -878,17 +1018,20 @@ body: | ... --- -# GCN-LABEL: name: store_system_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_release body: | bb.0: + ; GCN-LABEL: name: store_system_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -900,17 +1043,20 @@ body: | ... --- -# GCN-LABEL: name: store_system_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_system_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -922,17 +1068,19 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_unordered body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -944,17 +1092,19 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_monotonic body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -966,17 +1116,20 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_acquire body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -988,17 +1141,20 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_release body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -1010,17 +1166,21 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_acq_rel -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_acq_rel body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_acq_rel + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -1032,17 +1192,21 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_seq_cst body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir index 9405c8a946627..fd5c715ad0c60 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir @@ -1,18 +1,23 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck -check-prefix=GCN %s --- -# GCN-LABEL: name: load_singlethread_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_unordered body: | bb.0: + ; GCN-LABEL: name: load_singlethread_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 1, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -26,17 +31,21 @@ body: | ... --- -# GCN-LABEL: name: load_singlethread_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_monotonic body: | bb.0: + ; GCN-LABEL: name: load_singlethread_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -50,17 +59,21 @@ body: | ... --- -# GCN-LABEL: name: load_singlethread_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_acquire body: | bb.0: + ; GCN-LABEL: name: load_singlethread_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -74,17 +87,22 @@ body: | ... --- -# GCN-LABEL: name: load_singlethread_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_singlethread_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_singlethread_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -98,17 +116,21 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_unordered body: | bb.0: + ; GCN-LABEL: name: load_wavefront_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -122,17 +144,21 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_monotonic body: | bb.0: + ; GCN-LABEL: name: load_wavefront_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -146,17 +172,21 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_acquire body: | bb.0: + ; GCN-LABEL: name: load_wavefront_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -170,17 +200,21 @@ body: | ... --- -# GCN-LABEL: name: load_wavefront_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_wavefront_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_wavefront_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -194,17 +228,21 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_unordered body: | bb.0: + ; GCN-LABEL: name: load_workgroup_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -218,17 +256,21 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_monotonic body: | bb.0: + ; GCN-LABEL: name: load_workgroup_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -242,17 +284,21 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_acquire body: | bb.0: + ; GCN-LABEL: name: load_workgroup_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -266,17 +312,21 @@ body: | ... --- -# GCN-LABEL: name: load_workgroup_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_workgroup_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_workgroup_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -290,17 +340,21 @@ body: | ... --- -# GCN-LABEL: name: load_agent_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_unordered body: | bb.0: + ; GCN-LABEL: name: load_agent_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -314,17 +368,21 @@ body: | ... --- -# GCN-LABEL: name: load_agent_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_monotonic body: | bb.0: + ; GCN-LABEL: name: load_agent_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -338,17 +396,21 @@ body: | ... --- -# GCN-LABEL: name: load_agent_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_acquire body: | bb.0: + ; GCN-LABEL: name: load_agent_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -362,17 +424,21 @@ body: | ... --- -# GCN-LABEL: name: load_agent_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_agent_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_agent_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -386,17 +452,21 @@ body: | ... --- -# GCN-LABEL: name: load_system_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_unordered body: | bb.0: + ; GCN-LABEL: name: load_system_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -410,17 +480,21 @@ body: | ... --- -# GCN-LABEL: name: load_system_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_monotonic body: | bb.0: + ; GCN-LABEL: name: load_system_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -434,17 +508,21 @@ body: | ... --- -# GCN-LABEL: name: load_system_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_acquire body: | bb.0: + ; GCN-LABEL: name: load_system_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -458,17 +536,21 @@ body: | ... --- -# GCN-LABEL: name: load_system_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD name: load_system_seq_cst body: | bb.0: + ; GCN-LABEL: name: load_system_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 @@ -482,17 +564,19 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_unordered body: | bb.0: + ; GCN-LABEL: name: store_singlethread_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -504,17 +588,19 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_monotonic body: | bb.0: + ; GCN-LABEL: name: store_singlethread_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -526,17 +612,20 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_release body: | bb.0: + ; GCN-LABEL: name: store_singlethread_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -548,17 +637,20 @@ body: | ... --- -# GCN-LABEL: name: store_singlethread_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_singlethread_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_singlethread_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -570,17 +662,19 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_unordered body: | bb.0: + ; GCN-LABEL: name: store_wavefront_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -592,17 +686,19 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_monotonic body: | bb.0: + ; GCN-LABEL: name: store_wavefront_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -614,17 +710,19 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_release body: | bb.0: + ; GCN-LABEL: name: store_wavefront_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -636,17 +734,19 @@ body: | ... --- -# GCN-LABEL: name: store_wavefront_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_wavefront_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_wavefront_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -658,17 +758,19 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_unordered body: | bb.0: + ; GCN-LABEL: name: store_workgroup_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -680,17 +782,19 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_monotonic body: | bb.0: + ; GCN-LABEL: name: store_workgroup_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -702,17 +806,19 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_release body: | bb.0: + ; GCN-LABEL: name: store_workgroup_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -724,17 +830,19 @@ body: | ... --- -# GCN-LABEL: name: store_workgroup_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_workgroup_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_workgroup_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -746,17 +854,19 @@ body: | ... --- -# GCN-LABEL: name: store_agent_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_unordered body: | bb.0: + ; GCN-LABEL: name: store_agent_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -768,17 +878,19 @@ body: | ... --- -# GCN-LABEL: name: store_agent_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_monotonic body: | bb.0: + ; GCN-LABEL: name: store_agent_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -790,17 +902,19 @@ body: | ... --- -# GCN-LABEL: name: store_agent_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_release body: | bb.0: + ; GCN-LABEL: name: store_agent_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -812,17 +926,19 @@ body: | ... --- -# GCN-LABEL: name: store_agent_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_agent_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_agent_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -834,17 +950,19 @@ body: | ... --- -# GCN-LABEL: name: store_system_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_unordered body: | bb.0: + ; GCN-LABEL: name: store_system_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store unordered (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -856,17 +974,19 @@ body: | ... --- -# GCN-LABEL: name: store_system_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_monotonic body: | bb.0: + ; GCN-LABEL: name: store_system_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store monotonic (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -878,17 +998,19 @@ body: | ... --- -# GCN-LABEL: name: store_system_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_release body: | bb.0: + ; GCN-LABEL: name: store_system_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -900,17 +1022,19 @@ body: | ... --- -# GCN-LABEL: name: store_system_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: store_system_seq_cst body: | bb.0: + ; GCN-LABEL: name: store_system_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -922,17 +1046,19 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_unordered -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_unordered body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_unordered + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -944,17 +1070,19 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_monotonic -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_monotonic body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_monotonic + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -966,17 +1094,19 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_acquire -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_acquire body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_acquire + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -988,17 +1118,20 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_release -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_release body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_release + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -1010,17 +1143,20 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_acq_rel -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_acq_rel body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_acq_rel + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1 @@ -1032,17 +1168,20 @@ body: | ... --- -# GCN-LABEL: name: atomicrmw_singlethread_seq_cst -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 name: atomicrmw_singlethread_seq_cst body: | bb.0: + ; GCN-LABEL: name: atomicrmw_singlethread_seq_cst + ; GCN: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952 + ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) $m0 = S_MOV_B32 -1